From 8b712cd58adfe6aeeb0be4ecc011dc35620719e7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 7 Jul 2009 17:22:12 -0400 Subject: [PATCH 01/60] ocfs2: Fixup orphan scan cleanup after failed mount If the mount fails for any reason, ocfs2_dismount_volume calls ocfs2_orphan_scan_stop. It requires that ocfs2_orphan_scan_init be called to setup the mutex and work queues, but that doesn't happen if the mount has failed and we oops accessing an uninitialized work queue. This patch splits the init and startup of the orphan scan, eliminating the oops. Signed-off-by: Jeff Mahoney Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 8 +++++++- fs/ocfs2/journal.h | 1 + fs/ocfs2/super.c | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f033760ecbe..c48b93ac6b6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb) os->os_osb = osb; os->os_count = 0; os->os_seqno = 0; - os->os_scantime = CURRENT_TIME; mutex_init(&os->os_lock); INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 5432c7f79cc..81e8abcd246 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7efb349fb9b..63af2e36d83 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1182,7 +1182,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) wake_up(&osb->osb_mount_event); /* Start this when the mount is almost sure of being successful */ - ocfs2_orphan_scan_init(osb); + ocfs2_orphan_scan_start(osb); mlog_exit(status); return status; @@ -1981,6 +1981,8 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + ocfs2_orphan_scan_init(osb); + status = ocfs2_recovery_init(osb); if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); From 17ae26b669886efe237b77439e43eb390fceb119 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 7 Jul 2009 15:51:40 +0800 Subject: [PATCH 02/60] ocfs2: trivial fix for s/migrate/migration/ in dlmrecovery.c logging in dlmrecovery.c:1121, replace 'migrate' to 'migration' to keep the consistency by comparing to other lines with the similar log info in the same file. Signed-off-by: Jeff Liu Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index bcb9260c373..43e6e328056 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", dlm->name, res->lockname.len, res->lockname.name, - orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", send_to); /* send it */ From 812e7a6a43fc34bc8f70c2b80db4ea5997d66ea8 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 10 Jul 2009 13:26:04 +0800 Subject: [PATCH 03/60] ocfs2: log the actual return value of ocfs2_file_aio_write() in ocfs2_file_aio_write(), log_exit() could don't log the value which is really returned. this patch fixes it. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 62442e413a0..a49fa44aea1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1918,8 +1918,10 @@ out_sems: mutex_unlock(&inode->i_mutex); + if (written) + ret = written; mlog_exit(ret); - return written ? written : ret; + return ret; } static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, From cefcb800fa9536bb6f29485c53e6c82a65b0c022 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Sat, 11 Jul 2009 10:57:27 -0500 Subject: [PATCH 04/60] ocfs2: Initialize count in aio_write before generic_write_checks generic_write_checks() expects count to be initialized to the size of the write. Writes to files open with O_DIRECT|O_LARGEFILE write 0 bytes because count is uninitialized. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a49fa44aea1..aa501d3f93f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1851,6 +1851,7 @@ relock: if (ret) goto out_dio; + count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) From 44d8e4e1e9b941065eb7578669fe51eb65c73e63 Mon Sep 17 00:00:00 2001 From: Subrata Modak Date: Tue, 14 Jul 2009 01:19:31 +0530 Subject: [PATCH 05/60] ocfs2: Fix compilation warning for fs/ocfs2/xattr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 4.4.1 generates the following build warning on i386: CC [M] fs/ocfs2/xattr.o fs/ocfs2/xattr.c: In function ‘ocfs2_xattr_block_get’: fs/ocfs2/xattr.c:1055: warning: ‘block_off’ may be used uninitialized in this function The following fix is based on a similar approach by David Howells few days back: http://lkml.org/lkml/2009/7/9/109, Signed-off-by: Subrata Modak, Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ba320e25074..d1a27cda984 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_block *xb; struct ocfs2_xattr_value_root *xv; size_t size; - int ret = -ENODATA, name_offset, name_len, block_off, i; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { From cbfa9639aea5007313b75ec43b689d5f9e0c037d Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 13 Jul 2009 11:38:23 +0800 Subject: [PATCH 06/60] ocfs2: Fix error return in ocfs2_write_cluster() A typo caused ocfs2_write_cluster() to return 0 in some error cases. Fix it. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b2c52b3a148..25aced3b0c7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1301,7 +1301,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (tmpret) { mlog_errno(tmpret); if (ret == 0) - tmpret = ret; + ret = tmpret; } } From 1f4cea3790bf44137192f441ee84af256e3bf809 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 13 Jul 2009 11:38:58 +0800 Subject: [PATCH 07/60] ocfs2: Fail ocfs2_get_block() immediately when a block needs allocation ocfs2_get_block() does no allocation. Hole filling for writes should have happened farther up in the call chain. We detect this case and print an error, but we then continue with the function. We should be exiting immediately. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 25aced3b0c7..e511df10145 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); From 3c5e10683e684ef45614c9071847e48f633d9806 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 21 Jul 2009 15:42:05 +0800 Subject: [PATCH 08/60] ocfs2: Add extra credits and access the modified bh in update_edge_lengths. In normal tree rotation left process, we will never touch the tree branch above subtree_index and ocfs2_extend_rotate_transaction doesn't reserve the credits for them either. But when we want to delete the rightmost extent block, we have to update the rightmost records for all the rightmost branch(See ocfs2_update_edge_lengths), so we have to allocate extra credits for them. What's more, we have to access them also. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/alloc.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9edcde4974a..11085af7124 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2476,15 +2476,37 @@ out_ret_path: return ret; } -static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + int subtree_index, struct ocfs2_path *path) { - int i, idx; + int i, idx, ret; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; u32 range; + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, + handle->h_buffer_credits + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* Path should always be rightmost. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; BUG_ON(eb->h_next_leaf_blk != 0ULL); @@ -2505,6 +2527,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path->p_node[i].bh); } +out: + return ret; } static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, @@ -2717,7 +2741,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(inode, handle, left_path, right_path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -3034,7 +3063,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_unlink_subtree(inode, handle, left_path, path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); From f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 20 Jul 2009 12:12:36 +0200 Subject: [PATCH 09/60] ocfs2: Fix deadlock on umount In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we moved the dentry lock put process into ocfs2_wq. This causes problems during umount because ocfs2_wq can drop references to inodes while they are being invalidated by invalidate_inodes() causing all sorts of nasty things (invalidate_inodes() ending in an infinite loop, "Busy inodes after umount" messages etc.). We fix the problem by stopping ocfs2_wq from doing any further releasing of inode references on the superblock being unmounted, wait until it finishes the current round of releasing and finally cleaning up all the references in dentry_lock_list from ocfs2_put_super(). The issue was tracked down by Tao Ma . Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/dcache.c | 35 +++++++++++++++++++++++++++-------- fs/ocfs2/dcache.h | 3 +++ fs/ocfs2/ocfs2.h | 22 ++++++++++++++++++---- fs/ocfs2/super.c | 25 ++++++++++++++++++++++--- 4 files changed, 70 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b574431a031..2f28b7de2c8 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -310,22 +310,19 @@ out_attach: return ret; } -static DEFINE_SPINLOCK(dentry_list_lock); +DEFINE_SPINLOCK(dentry_list_lock); /* We limit the number of dentry locks to drop in one go. We have * this limit so that we don't starve other users of ocfs2_wq. */ #define DL_INODE_DROP_COUNT 64 /* Drop inode references from dentry locks */ -void ocfs2_drop_dl_inodes(struct work_struct *work) +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) { - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); struct ocfs2_dentry_lock *dl; - int drop_count = DL_INODE_DROP_COUNT; spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && drop_count--) { + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { dl = osb->dentry_lock_list; osb->dentry_lock_list = dl->dl_next; spin_unlock(&dentry_list_lock); @@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work) kfree(dl); spin_lock(&dentry_list_lock); } - if (osb->dentry_lock_list) + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); spin_unlock(&dentry_list_lock); } +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + /* * ocfs2_dentry_iput() and friends. * @@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, /* We leave dropping of inode reference to ocfs2_wq as that can * possibly lead to inode deletion which gets tricky */ spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list) + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); dl->dl_next = osb->dentry_lock_list; osb->dentry_lock_list = dl; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index faa12e75f98..f5dd1789acf 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -49,10 +49,13 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); +extern spinlock_t dentry_list_lock; + void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c9345ebb849..39e1d5a3950 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,10 +224,12 @@ enum ocfs2_mount_options OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; @@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 63af2e36d83..f2893878f8a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1213,14 +1213,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type, mnt); } +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); + + kill_block_super(sb); +} + static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .get_sb = ocfs2_get_sb, /* is this called when we mount * the fs? */ - .kill_sb = kill_block_super, /* set to the generic one - * right now, but do we - * need to change that? */ + .kill_sb = ocfs2_kill_sb, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1819,6 +1832,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); From 82e12644cf5227dab15201fbcaf0ca6330ebd70f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 23 Jul 2009 08:12:58 +0800 Subject: [PATCH 10/60] ocfs2: Use ocfs2_rec_clusters in ocfs2_adjust_adjacent_records. In ocfs2_adjust_adjacent_records, we will adjust adjacent records according to the extent_list in the lower level. But actually the lower level tree will either be a leaf or a branch. If we only use ocfs2_is_empty_extent we will meet with some problem if the lower tree is a branch (tree_depth > 1). So use !ocfs2_rec_clusters instead. And actually only the leaf record can have holes. So add a BUG_ON for non-leaf branch. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 11085af7124..f9a3e894266 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); - if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); } From b57ac2c43e66cb4aa76c9d2d0e9e0e04f19cc5a6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:15 +0200 Subject: [PATCH 11/60] ocfs2: Make global quota files blocksize aligned Change i_size of global quota files so that it always remains aligned to block size. This is mainly because the end of quota block may contain checksum (if checksumming is enabled) and it's a bit awkward for it to be "outside" of quota file (and it makes life harder for ocfs2-tools). Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index edfa60cd155..a66cb82fd6e 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -211,14 +211,17 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); if (gqinode->i_size < off + len) { + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, off + len, off); + err = ocfs2_extend_no_holes(gqinode, rounded_end, off); up_write(&OCFS2_I(gqinode)->ip_alloc_sem); if (err < 0) goto out; err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, - off + len); + rounded_end); if (err < 0) goto out; new = 1; From 4b3fa1904c0d192461ebba692e4940a334b74353 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:16 +0200 Subject: [PATCH 12/60] ocfs2: Mark buffer uptodate before calling ocfs2_journal_access_dq() In a code path extending local quota files we marked new header buffer uptodate only after calling ocfs2_journal_access_dq() which triggers a bug. Fix it and also call ocfs2 variant of the function marking buffer uptodate. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_local.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5a460fa8255..b624f9bc928 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -20,6 +20,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "quota.h" +#include "uptodate.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -979,6 +980,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; handle = ocfs2_start_trans(OCFS2_SB(sb), 2); @@ -999,7 +1002,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( memset(dchunk->dqc_bitmap, 0, sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); - set_buffer_uptodate(bh); unlock_buffer(bh); status = ocfs2_journal_dirty(handle, bh); if (status < 0) { From 0e7f387bf386c99e8ee322649fe4fc23b9cccc6c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:17 +0200 Subject: [PATCH 13/60] ocfs2: Initialize blocks allocated to local quota file When we extend local quota file, we should initialize data in newly allocated block. Firstly because on recovery we could parse bogus data, secondly so that block checksums are properly computed. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_local.c | 114 ++++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 23 deletions(-) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b624f9bc928..9d2993de108 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -941,7 +941,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( struct ocfs2_local_disk_chunk *dchunk; int status; handle_t *handle; - struct buffer_head *bh = NULL; + struct buffer_head *bh = NULL, *dbh = NULL; u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ @@ -965,34 +965,32 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } - - down_read(&OCFS2_I(lqinode)->ip_alloc_sem); - status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, - &p_blkno, NULL, NULL); - up_read(&OCFS2_I(lqinode)->ip_alloc_sem); - if (status < 0) { - mlog_errno(status); - goto out; - } - bh = sb_getblk(sb, p_blkno); - if (!bh) { - status = -ENOMEM; - mlog_errno(status); - goto out; - } - ocfs2_set_new_buffer_uptodate(lqinode, bh); - - dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + handle = ocfs2_start_trans(OCFS2_SB(sb), 3); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } + /* Initialize chunk header */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_new_buffer_uptodate(lqinode, bh); status = ocfs2_journal_access_dq(handle, lqinode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; @@ -1009,6 +1007,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } + /* Initialize new block with structures */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(lqinode, dbh); + status = ocfs2_journal_access_dq(handle, lqinode, dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + status = ocfs2_journal_dirty(handle, dbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + /* Update local quotafile info */ oinfo->dqi_blocks += 2; oinfo->dqi_chunks++; status = ocfs2_local_write_info(sb, type); @@ -1033,6 +1063,7 @@ out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: brelse(bh); + brelse(dbh); kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); return ERR_PTR(status); } @@ -1050,6 +1081,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( struct ocfs2_local_disk_chunk *dchunk; int epb = ol_quota_entries_per_block(sb); unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; int status; handle_t *handle; @@ -1077,12 +1110,46 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + + /* Get buffer from the just added block */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + + handle = ocfs2_start_trans(OCFS2_SB(sb), 3); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1099,6 +1166,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out_trans; } + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { From 7669f54c55df225cbb2db939a6c9f111445ffc1c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:18 +0200 Subject: [PATCH 14/60] ocfs2: Zero out padding of on disk dquot structure Padding fields of on-disk dquot structure were not zeroed. Zero them so that it's easier to use them later. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index a66cb82fd6e..7248db68135 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -69,6 +69,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; } static int ocfs2_global_is_id(void *dp, struct dquot *dquot) From 1c1d9793ff6720531c0125a28d321f283716e32f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:19 +0200 Subject: [PATCH 15/60] ocfs2: Fix initialization of blockcheck stats We just set blockcheck stats to zeros but we should also properly initialize the spinlock there. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f2893878f8a..b0ee0fdf799 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb, } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); status = ocfs2_verify_volume(di, *bh, blksize, stats); if (status >= 0) goto bail; From 4539f1df25bcd0fdf0d8a5e2c92de6bece83c7a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:20 +0200 Subject: [PATCH 16/60] ocfs2: Remove syncjiff field from quota info syncjiff is just a converted value of syncms. Some places which are updating syncms forgot to update syncjiff as well. Since the conversion is just a simple division / multiplication and it does not happen frequently, just remove the syncjiff field to avoid forgotten conversions. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota.h | 1 - fs/ocfs2/quota_global.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 7365e2e0870..3fb96fcd4c8 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo { unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ - unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ struct list_head dqi_chunk; /* List of chunks */ struct inode *dqi_gqinode; /* Global quota file inode */ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 7248db68135..dab45467b5f 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -346,7 +346,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); - oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); @@ -356,7 +355,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -611,7 +610,7 @@ static void qsync_work_fn(struct work_struct *work) dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); } /* From 0584974a77796581eb3a64b6c5005edac4a95128 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:21 +0200 Subject: [PATCH 17/60] ocfs2: Define credit counts for quota operations Numbers of needed credits for some quota operations were written as raw numbers. Create appropriate defines instead. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/journal.h | 15 +++++++++++---- fs/ocfs2/quota_global.c | 18 +++++++++++++----- fs/ocfs2/quota_local.c | 16 ++++++++++++---- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 81e8abcd246..4a4d3b55fd2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -330,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 -/* global quotafile inode update, data block */ -#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 +/* global quotafile inode update, data block */ +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS /* * The two writes below can accidentally see global info dirty due * to set_info() quotactl so make them prepared for the writes. */ /* quota data block, global info */ /* Write to local quota file */ -#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) /* global quota data block, local quota data block, global quota inode, * global quota info */ -#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) static inline int ocfs2_quota_trans_credits(struct super_block *sb) { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index dab45467b5f..cc8785cf8f6 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -648,10 +648,15 @@ int ocfs2_calc_qdel_credits(struct super_block *sb, int type) return 0; oinfo = sb_dqinfo(sb, type)->dqi_priv; - /* We modify tree, leaf block, global info, local chunk header, - * global and local inode */ - return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + - 2 * OCFS2_INODE_UPDATE_CREDITS; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return oinfo->dqi_gi.dqi_qtree_depth + + OCFS2_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; } static int ocfs2_release_dquot(struct dquot *dquot) @@ -701,7 +706,10 @@ int ocfs2_calc_qinit_credits(struct super_block *sb, int type) * global file we can modify info, tree and leaf block */ return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + oinfo->dqi_gi.dqi_qtree_depth + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; } static int ocfs2_acquire_dquot(struct dquot *dquot) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 9d2993de108..bdb09cb6e1f 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -101,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, handle_t *handle; int status; - handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -611,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, goto out_bh; /* Mark quota file as clean if we are recovering quota file of * some other node. */ - handle = ocfs2_start_trans(osb, 1); + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -965,7 +967,10 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -1128,7 +1133,10 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( } ocfs2_set_new_buffer_uptodate(lqinode, bh); - handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); From e9956fae7dbce6ac770e7a00436c496ddbbd3215 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 30 Jul 2009 16:07:10 +0800 Subject: [PATCH 18/60] ocfs2/quota: Release lock for error in ocfs2_quota_write. ocfs2_quota_write needs to release the lock if it fails to read quota block. So use "goto out" instead of "return err". Signed-off-by: Tao Ma Acked-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index cc8785cf8f6..d604a6aa0a2 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -238,7 +238,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, } if (err) { mlog_errno(err); - return err; + goto out; } lock_buffer(bh); if (new) From ab57a40827d99e2d8e59066a56b93bf6c844c916 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 31 Jul 2009 14:28:02 -0500 Subject: [PATCH 19/60] ocfs2: Remove redundant BUG_ON in __dlm_queue_ast() We BUG_ON() the same thing twice. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmast.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index d07ddbe4b28..81eff8e5832 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) lock->ast_pending, lock->ml.type); BUG(); } - BUG_ON(!list_empty(&lock->ast_list)); if (lock->ast_pending) mlog(0, "lock has an ast getting flushed right now\n"); From e7432675f8ca868a4af365759a8d4c3779a3d922 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 6 Aug 2009 16:12:58 -0700 Subject: [PATCH 20/60] ocfs2: Initialize the cluster we're writing to in a non-sparse extend In a non-sparse extend, we correctly allocate (and zero) the clusters between the old_i_size and pos, but we don't zero the portions of the cluster we're writing to outside of pos<->len. It handles clustersize > pagesize and blocksize < pagesize. [Cleaned up by Joel Becker.] Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 66 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e511df10145..b401654011a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -895,18 +895,17 @@ struct ocfs2_write_cluster_desc { */ unsigned c_new; unsigned c_unwritten; + unsigned c_needs_zero; }; -static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) -{ - return d->c_new || d->c_unwritten; -} - struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; u32 w_clen; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -984,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, return -ENOMEM; wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; cend = (pos + len - 1) >> osb->s_clustersize_bits; wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); @@ -1218,20 +1218,18 @@ out: */ static int ocfs2_write_cluster(struct address_space *mapping, u32 phys, unsigned int unwritten, + unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new, should_zero = 0; + int ret, i, new; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; - if (new || unwritten) - should_zero = 1; - if (new) { u32 tmp_pos; @@ -1342,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, local_len = osb->s_clustersize - cluster_off; ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, data_ac, meta_ac, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); if (ret) { mlog_errno(ret); @@ -1392,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1467,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, phys++; } + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; + desc->c_needs_zero = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } - if (ext_flags & OCFS2_EXT_UNWRITTEN) + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } num_clusters--; } @@ -1633,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); + ret = ocfs2_extend_no_holes(inode, newsize, pos); if (ret) mlog_errno(ret); + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + return ret; } @@ -1645,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; @@ -1723,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } - ocfs2_set_target_boundaries(osb, wc, pos, len, - clusters_to_alloc + extents_to_split); + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1757,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc + extents_to_split, - mmap_page); + cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); goto out_quota; From 8a57a9dda760ea7845390f1cd36f3eb2a49391d8 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 16:07:50 -0700 Subject: [PATCH 21/60] ocfs2: keep index within status_map[] Do not exceed array status_map[] Signed-off-by: Roel Kluin Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Joel Becker --- fs/ocfs2/stack_o2cb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3f661376a2d..e49c4105026 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -17,6 +17,7 @@ * General Public License for more details. */ +#include #include #include @@ -153,7 +154,7 @@ static int status_map[] = { static int dlm_status_to_errno(enum dlm_status status) { - BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } From c8c00a6915a2e3d10416e8bdd3138429beb96210 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Aug 2009 12:50:52 +1000 Subject: [PATCH 22/60] Remove deadlock potential in md_open A recent commit: commit 449aad3e25358812c43afc60918c5ad3819488e7 introduced the possibility of an A-B/B-A deadlock between bd_mutex and reconfig_mutex. __blkdev_get holds bd_mutex while calling md_open which takes reconfig_mutex, do_md_run is always called with reconfig_mutex held, and it now takes bd_mutex in the call the revalidate_disk. This potential deadlock was not caught by lockdep due to the use of mutex_lock_interruptible_nexted which was introduced by commit d63a5a74dee87883fda6b7d170244acaac5b05e8 do avoid a warning of an impossible deadlock. It is quite possible to split reconfig_mutex in to two locks. One protects the array data structures while it is being reconfigured, the other ensures that an array is never even partially open while it is being deactivated. In particular, the second lock prevents an open from completing between the time when do_md_stop checks if there are any active opens, and the time when the array is either set read-only, or when ->pers is set to NULL. So we can be certain that no IO is in flight as the array is being destroyed. So create a new lock, open_mutex, just to ensure exclusion between 'open' and 'stop'. This avoids the deadlock and also avoids the lockdep warning mentioned in commit d63a5a74d Reported-by: "Mike Snitzer" Reported-by: "H. Peter Anvin" Signed-off-by: NeilBrown --- drivers/md/md.c | 18 ++++++++++-------- drivers/md/md.h | 10 ++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5b98bea4ff9..5614500092e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; + mutex_init(&new->open_mutex); mutex_init(&new->reconfig_mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); @@ -4304,12 +4305,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; + mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); - return -EBUSY; - } - - if (mddev->pers) { + err = -EBUSY; + } else if (mddev->pers) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4367,7 +4367,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) set_disk_ro(disk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - +out: + mutex_unlock(&mddev->open_mutex); + if (err) + return err; /* * Free resources if final stop */ @@ -4433,7 +4436,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); -out: return err; } @@ -5518,12 +5520,12 @@ static int md_open(struct block_device *bdev, fmode_t mode) } BUG_ON(mddev != bdev->bd_disk->private_data); - if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; err = 0; atomic_inc(&mddev->openers); - mddev_unlock(mddev); + mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); out: diff --git a/drivers/md/md.h b/drivers/md/md.h index 78f03168baf..f8fc188bc76 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -223,6 +223,16 @@ struct mddev_s * so we don't loop trying */ int in_sync; /* know to not need resync */ + /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so + * that we are never stopping an array while it is open. + * 'reconfig_mutex' protects all other reconfiguration. + * These locks are separate due to conflicting interactions + * with bdev->bd_mutex. + * Lock ordering is: + * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk + * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + */ + struct mutex open_mutex; struct mutex reconfig_mutex; atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 9 Aug 2009 15:34:39 -0700 Subject: [PATCH 23/60] futex: Update futex_q lock_ptr on requeue proxy lock futex_requeue() can acquire the lock on behalf of a waiter early on or during the requeue loop if it is uncontended or in the event of a lock steal or owner died. On wakeup, the waiter (in futex_wait_requeue_pi()) cleans up the pi_state owner using the lock_ptr to protect against concurrent access to the pi_state. The pi_state is hung off futex_q's on the requeue target futex hash bucket so the lock_ptr needs to be updated accordingly. The problem manifested by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner->pid. With this patch, the pi_state is properly guarded against concurrent access via the requeue target hb lock. The astute reviewer may notice that there is a window of time between when futex_requeue() unlocks the hb locks and when futex_wait_requeue_pi() will acquire hb2->lock. During this time the pi_state and uval are not in sync with the underlying rtmutex owner (but the uval does indicate there are waiters, so no atomic changes will occur in userspace). However, this is not a problem. Should a contending thread enter lookup_pi_state() and acquire hb2->lock before the ownership is fixed up, it will find the pi_state hung off a waiter's (possibly the pending owner's) futex_q and block on the rtmutex. Once futex_wait_requeue_pi() fixes up the owner, it will also move the pi_state from the old owner's task->pi_state_list to its own. v3: Fix plist lock name for application to mainline (rather than -rt) Compile tested against tip/v2.6.31-rc5. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7F4EFF.6090903@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f15..8cc3ee1363a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; + q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + wake_up_state(q->task, TASK_NORMAL); } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1282,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. */ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: [PATCH 24/60] locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- kernel/wait.c | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4c..cf3c2f5dba5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275c..c4bd3d825f3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Mon, 10 Aug 2009 18:31:42 +0530 Subject: [PATCH 25/60] futex: Fix compat_futex to be same as futex for REQUEUE_PI Need to add the REQUEUE_PI checks to the compat_sys_futex API as well to ensure 32 bit requeue's work fine on a 64 bit system. Patch is against latest tip Signed-off-by: Dinakar Guniguntala Cc: Darren Hart LKML-Reference: <20090810130142.GA23619@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b9ee2..235716556bf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add_safe(ktime_get(), t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 7 Aug 2009 15:20:48 -0700 Subject: [PATCH 26/60] futex: Fix handling of bad requeue syscall pairing If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call other the futex_wait_requeue_pi(), the q.rt_waiter may be null. If so, this will result in an oops from the following call graph: futex_requeue() rt_mutex_start_proxy_lock() task_blocks_on_rt_mutex() waiter->task dereference OOPS We currently WARN_ON() if this is detected, clearly this is inadequate. If we detect a mispairing in futex_requeue(), bail out, seding -EINVAL to user-space. V2: Fix parenthesis warnings. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: John Kacur Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7CA8C0.7010809@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 8cc3ee1363a..e18cfbdc719 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1256,8 +1256,15 @@ retry_private: if (!match_futex(&this->key, &key1)) continue; - WARN_ON(!requeue_pi && this->rt_waiter); - WARN_ON(requeue_pi && !this->rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter)) { + ret = -EINVAL; + break; + } /* * Wake nr_wake waiters. For requeue_pi, if we acquired the From 3e03bbeac541856aaaf1ce1ab0250b6a490e4099 Mon Sep 17 00:00:00 2001 From: Shunichi Fuji Date: Tue, 11 Aug 2009 03:34:40 +0900 Subject: [PATCH 27/60] x86: Add reboot quirk for every 5 series MacBook/Pro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reboot does not work on my MacBook Pro 13 inch (MacBookPro5,5) too. It seems all unibody MacBook and MacBookPro require PCI reboot handling, i guess. Following model/machine ID list shows unibody MacBook/Pro have the 5 series of model number: http://www.everymac.com/systems/by_capability/macs-by-machine-model-machine-id.html Signed-off-by: Shunichi Fuji Cc: Ozan Çağlayan LKML-Reference: <30046e3b0908101134p6487ddbftd8776e4ddef204be@mail.gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9eb89760370..a06e8d10184 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -418,20 +418,20 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) } static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { - { /* Handle problems with rebooting on Apple MacBook5,2 */ + { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, - .ident = "Apple MacBook", + .ident = "Apple MacBook5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, - { /* Handle problems with rebooting on Apple MacBookPro5,1 */ + { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, - .ident = "Apple MacBookPro5,1", + .ident = "Apple MacBookPro5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, { } From b409d7a0ab46fe530efe52734984b4ed5d46c3eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Aug 2009 23:29:34 +0200 Subject: [PATCH 28/60] ocfs2: Fix possible deadlock when extending quota file In OCFS2, allocator locks rank above transaction start. Thus we cannot extend quota file from inside a transaction less we could deadlock. We solve the problem by starting transaction not already in ocfs2_acquire_dquot() but only in ocfs2_local_read_dquot() and ocfs2_global_read_dquot() and we allocate blocks to quota files before starting the transaction. In case we crash, quota files will just have a few blocks more but that's no problem since we just use them next time we extend the quota file. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/journal.h | 5 -- fs/ocfs2/quota_global.c | 115 ++++++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 63 deletions(-) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 4a4d3b55fd2..2c3222aec62 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -363,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb) return credits; } -/* Number of credits needed for removing quota structure from file */ -int ocfs2_calc_qdel_credits(struct super_block *sb, int type); -/* Number of credits needed for initialization of new quota structure */ -int ocfs2_calc_qinit_credits(struct super_block *sb, int type); - /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index d604a6aa0a2..bf7742d0ee3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -215,11 +215,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); - down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, rounded_end, off); - up_write(&OCFS2_I(gqinode)->ip_alloc_sem); - if (err < 0) - goto out; + /* Space is already allocated in ocfs2_global_read_dquot() */ err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, rounded_end); @@ -405,13 +401,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type) return err; } +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, and info block */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; +} + /* Read in information from global quota file and acquire a reference to it. * dquot_acquire() has already started the transaction and locked quota file */ int ocfs2_global_read_dquot(struct dquot *dquot) { int err, err2, ex = 0; - struct ocfs2_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle = NULL; err = ocfs2_qinfo_lock(info, 0); if (err < 0) @@ -422,14 +441,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot) OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + ocfs2_qinfo_unlock(info, 0); + if (!dquot->dq_off) { /* No real quota entry? */ - /* Upgrade to exclusive lock for allocation */ - ocfs2_qinfo_unlock(info, 0); - err = ocfs2_qinfo_lock(info, 1); - if (err < 0) - goto out_qlock; ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + err = ocfs2_qinfo_lock(info, ex); + if (err < 0) + goto out_trans; err = qtree_write_dquot(&info->dqi_gi, dquot); if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); @@ -441,6 +479,9 @@ out_qlock: ocfs2_qinfo_unlock(info, 1); else ocfs2_qinfo_unlock(info, 0); +out_trans: + if (handle) + ocfs2_commit_trans(osb, handle); out: if (err < 0) mlog_errno(err); @@ -638,24 +679,17 @@ out: return status; } -int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) { - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; /* * We modify tree, leaf block, global info, local chunk header, * global and local inode; OCFS2_QINFO_WRITE_CREDITS already * accounts for inode update */ - return oinfo->dqi_gi.dqi_qtree_depth + + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + OCFS2_QINFO_WRITE_CREDITS + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + OCFS2_INODE_UPDATE_CREDITS; } @@ -688,36 +722,10 @@ out: return status; } -int ocfs2_calc_qinit_credits(struct super_block *sb, int type) -{ - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - struct ocfs2_dinode *lfe, *gfe; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; - lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; - /* We can extend local file + global file. In local file we - * can modify info, chunk header block and dquot block. In - * global file we can modify info, tree and leaf block */ - return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + - ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - OCFS2_LOCAL_QINFO_WRITE_CREDITS + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + - oinfo->dqi_gi.dqi_qtree_depth + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; -} - static int ocfs2_acquire_dquot(struct dquot *dquot) { - handle_t *handle; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); @@ -726,16 +734,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; - handle = ocfs2_start_trans(osb, - ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_ilock; - } status = dquot_acquire(dquot); - ocfs2_commit_trans(osb, handle); -out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: mlog_exit(status); From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 9 Aug 2009 21:44:49 -0700 Subject: [PATCH 29/60] x86, mce: therm_throt - change when we print messages My Latitude d630 seems to be handling thermal events in SMI by lowering the max frequency of the CPU till it cools down but still leaks the "everything is normal" events. This spams the console and with high priority printks. Adjust therm_throt driver to only print messages about the fact that temperatire returned back to normal when leaving the throttling state. Also lower the severity of "back to normal" message from KERN_CRIT to KERN_INFO. Signed-off-by: Dmitry Torokhov Acked-by: H. Peter Anvin LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd..8bc64cfbe93 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,24 +97,27 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } else if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); } return 1; From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:47:36 +0200 Subject: [PATCH 30/60] perf_counter, x86: Fix lapic printk message Instead of this garbled bootup on UP Pentium-M systems: [ 0.015048] Performance Counters: [ 0.016004] no Local APIC, try rebooting with lapicno PMU driver, software counters only. Print: [ 0.015050] Performance Counters: [ 0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it. [ 0.017003] no PMU driver, software counters only. Cf: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f90095..40e233a24d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1590,7 +1590,7 @@ static int p6_pmu_init(void) } if (!cpu_has_apic) { - pr_info("no Local APIC, try rebooting with lapic"); + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); return -ENODEV; } From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:26:33 +0200 Subject: [PATCH 31/60] perf_counter, x86: Fix generic cache events on P6-mobile CPUs Johannes Stezenbach reported that 'perf stat' does not count cache-miss and cache-references events on his Pentium-M based laptop. This is because we left them blank in p6_perfmon_event_map[], fill them in. Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40e233a24d9..fffc126dbdf 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Mon, 10 Aug 2009 19:56:45 -0300 Subject: [PATCH 32/60] x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag Due to an erratum with certain AMD Athlon 64 processors, the BIOS may need to force enable the LAHF_LM capability. Unfortunately, in at least one case, the BIOS does this even for processors that do not support the functionality. Add a specific check that will clear the feature bit for processors known not to support the LAHF/SAHF instructions. Signed-off-by: Kevin Winchester Acked-by: Borislav Petkov LKML-Reference: <4A80A5AD.2000209@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2485b03f1c..63fddcd082c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); From e8055139d996e85722984968472868d6dccb1490 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Tue, 11 Aug 2009 20:00:11 +0200 Subject: [PATCH 33/60] x86: Fix oops in identify_cpu() on CPUs without CPUID Kernel is broken for x86 CPUs without CPUID since 2.6.28. It crashes with NULL pointer dereference in identify_cpu(): 766 generic_identify(c); 767 768--> if (this_cpu->c_identify) 769 this_cpu->c_identify(c); this_cpu is NULL. This is because it's only initialized in get_cpu_vendor() function, which is not called if the CPU has no CPUID instruction. Signed-off-by: Ondrej Zary LKML-Reference: <200908112000.15993.linux@rainbow-software.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9..5ce60a88027 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; From 2a8083f063472f27c253545dd64e1a7bbbb1ab61 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 16:22:00 -0300 Subject: [PATCH 34/60] perf record: Fix .tid and .pid fill-in when synthesizing events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Noticed when trying to record events for a firefox thread. We were synthesizing both .tid and .pid with the pid passed via --pid. Fix it by reading /proc/PID/status and getting the tgid to use in .pid, .tid gets the specified "pid". Signed-off-by: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Frédéric Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20090811192200.GF18061@ghostprotocols.net> Signed-off-by: Ingo Molnar --- tools/perf/builtin-record.c | 69 ++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 0345aad8eba..30b83def03d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -203,46 +203,48 @@ static void sig_atexit(void) kill(getpid(), signr); } -static void pid_synthesize_comm_event(pid_t pid, int full) +static pid_t pid_synthesize_comm_event(pid_t pid, int full) { struct comm_event comm_ev; char filename[PATH_MAX]; char bf[BUFSIZ]; - int fd; - size_t size; - char *field, *sep; + FILE *fp; + size_t size = 0; DIR *tasks; struct dirent dirent, *next; + pid_t tgid = 0; - snprintf(filename, sizeof(filename), "/proc/%d/stat", pid); + snprintf(filename, sizeof(filename), "/proc/%d/status", pid); - fd = open(filename, O_RDONLY); - if (fd < 0) { + fp = fopen(filename, "r"); + if (fd == NULL) { /* * We raced with a task exiting - just return: */ if (verbose) fprintf(stderr, "couldn't open %s\n", filename); - return; + return 0; } - if (read(fd, bf, sizeof(bf)) < 0) { - fprintf(stderr, "couldn't read %s\n", filename); - exit(EXIT_FAILURE); - } - close(fd); - /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */ memset(&comm_ev, 0, sizeof(comm_ev)); - field = strchr(bf, '('); - if (field == NULL) - goto out_failure; - sep = strchr(++field, ')'); - if (sep == NULL) - goto out_failure; - size = sep - field; - memcpy(comm_ev.comm, field, size++); + while (!comm_ev.comm[0] || !comm_ev.pid) { + if (fgets(bf, sizeof(bf), fp) == NULL) + goto out_failure; + + if (memcmp(bf, "Name:", 5) == 0) { + char *name = bf + 5; + while (*name && isspace(*name)) + ++name; + size = strlen(name) - 1; + memcpy(comm_ev.comm, name, size++); + } else if (memcmp(bf, "Tgid:", 5) == 0) { + char *tgids = bf + 5; + while (*tgids && isspace(*tgids)) + ++tgids; + tgid = comm_ev.pid = atoi(tgids); + } + } - comm_ev.pid = pid; comm_ev.header.type = PERF_EVENT_COMM; size = ALIGN(size, sizeof(u64)); comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); @@ -251,7 +253,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full) comm_ev.tid = pid; write_output(&comm_ev, comm_ev.header.size); - return; + goto out_fclose; } snprintf(filename, sizeof(filename), "/proc/%d/task", pid); @@ -268,7 +270,10 @@ static void pid_synthesize_comm_event(pid_t pid, int full) write_output(&comm_ev, comm_ev.header.size); } closedir(tasks); - return; + +out_fclose: + fclose(fp); + return tgid; out_failure: fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n", @@ -276,7 +281,7 @@ out_failure: exit(EXIT_FAILURE); } -static void pid_synthesize_mmap_samples(pid_t pid) +static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid) { char filename[PATH_MAX]; FILE *fp; @@ -328,7 +333,7 @@ static void pid_synthesize_mmap_samples(pid_t pid) mmap_ev.len -= mmap_ev.start; mmap_ev.header.size = (sizeof(mmap_ev) - (sizeof(mmap_ev.filename) - size)); - mmap_ev.pid = pid; + mmap_ev.pid = tgid; mmap_ev.tid = pid; write_output(&mmap_ev, mmap_ev.header.size); @@ -347,14 +352,14 @@ static void synthesize_all(void) while (!readdir_r(proc, &dirent, &next) && next) { char *end; - pid_t pid; + pid_t pid, tgid; pid = strtol(dirent.d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; - pid_synthesize_comm_event(pid, 1); - pid_synthesize_mmap_samples(pid); + tgid = pid_synthesize_comm_event(pid, 1); + pid_synthesize_mmap_samples(pid, tgid); } closedir(proc); @@ -567,8 +572,8 @@ static int __cmd_record(int argc, const char **argv) perf_header__write(header, output); if (!system_wide) { - pid_synthesize_comm_event(pid, 0); - pid_synthesize_mmap_samples(pid); + pid_t tgid = pid_synthesize_comm_event(pid, 0); + pid_synthesize_mmap_samples(pid, tgid); } else synthesize_all(); From 94a24752fe95ca1e7f98b197052d44e6a207740d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 16:21:38 -0300 Subject: [PATCH 35/60] perf report: Show the tid too in -D MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This made it easier to find the firefox threading related bug. Signed-off-by: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frédéric Weisbecker LKML-Reference: <20090811192138.GE18061@ghostprotocols.net> Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 99274cec0ad..23e1457f240 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1526,11 +1526,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d: %p period: %Ld\n", + dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, - event->ip.pid, + event->ip.pid, event->ip.tid, (void *)(long)ip, (long long)period); @@ -1612,10 +1612,11 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) struct thread *thread = threads__findnew(event->mmap.pid); struct map *map = map__new(&event->mmap); - dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", + dprintf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, + event->mmap.tid, (void *)(long)event->mmap.start, (void *)(long)event->mmap.len, (void *)(long)event->mmap.pgoff, From 247648e3742ded01e42a4b14c2da330b13cbb47f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 16:22:11 -0300 Subject: [PATCH 36/60] perf tools: Fix fallback to cplus_demangle() when bfd_demangle() is not available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In old binutils we can't access bfd_demangle(), use cplus_demangle() just like oprofile. Signed-off-by: Arnaldo Carvalho de Melo Cc: Luis Claudio R. Gonçalves Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frédéric Weisbecker LKML-Reference: <20090811192211.GG18061@ghostprotocols.net> Signed-off-by: Ingo Molnar --- tools/perf/Makefile | 29 ++++++++++++++++++----------- tools/perf/util/symbol.c | 15 --------------- tools/perf/util/symbol.h | 24 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 60411e94113..c045b4271e5 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -382,22 +382,29 @@ endif ifdef NO_DEMANGLE BASIC_CFLAGS += -DNO_DEMANGLE else - has_bfd := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd > /dev/null 2>&1 && echo y") - has_bfd_iberty := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y") - - has_bfd_iberty_z := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") - ifeq ($(has_bfd),y) EXTLIBS += -lbfd - else ifeq ($(has_bfd_iberty),y) - EXTLIBS += -lbfd -liberty - else ifeq ($(has_bfd_iberty_z),y) - EXTLIBS += -lbfd -liberty -lz else - msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) - BASIC_CFLAGS += -DNO_DEMANGLE + has_bfd_iberty := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y") + ifeq ($(has_bfd_iberty),y) + EXTLIBS += -lbfd -liberty + else + has_bfd_iberty_z := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") + ifeq ($(has_bfd_iberty_z),y) + EXTLIBS += -lbfd -liberty -lz + else + has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -liberty > /dev/null 2>&1 && echo y") + ifeq ($(has_cplus_demangle),y) + EXTLIBS += -liberty + BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE + else + msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) + BASIC_CFLAGS += -DNO_DEMANGLE + endif + endif + endif endif endif diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index f1dcede1430..2473fd427be 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -7,23 +7,8 @@ #include #include -#ifndef NO_DEMANGLE -#include -#else -static inline -char *bfd_demangle(void __used *v, const char __used *c, int __used i) -{ - return NULL; -} -#endif - const char *sym_hist_filter; -#ifndef DMGL_PARAMS -#define DMGL_PARAMS (1 << 0) /* Include function args */ -#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ -#endif - enum dso_origin { DSO__ORIG_KERNEL = 0, DSO__ORIG_JAVA_JIT, diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 1e003ec2f4b..b53bf0125c1 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -7,6 +7,30 @@ #include #include "module.h" +#ifdef HAVE_CPLUS_DEMANGLE +extern char *cplus_demangle(const char *, int); + +static inline char *bfd_demangle(void __used *v, const char *c, int i) +{ + return cplus_demangle(c, i); +} +#else +#ifdef NO_DEMANGLE +static inline char *bfd_demangle(void __used *v, const char __used *c, + int __used i) +{ + return NULL; +} +#else +#include +#endif +#endif + +#ifndef DMGL_PARAMS +#define DMGL_PARAMS (1 << 0) /* Include function args */ +#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ +#endif + struct symbol { struct rb_node rb_node; u64 start; From 1340e6bbaff7ff7f6f75eb4a5c34933efce84a84 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 17:04:36 -0300 Subject: [PATCH 37/60] perf tools: Fix dso__new handle() to handle deleted DSOs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is better than showing the map addr, this way at least we know that we can't get the symtabs because the DSO was deleted (system update) while an app still used such DSO. Yeah, don't do that, but if you do, you'll figure it out quicker this way. [acme@doppio linux-2.6-tip]$ perf report | head -15 # Samples: 3796 # # Overhead Command Shared Object Symbol # ........ ....... ................................................................... ...... # 23.55% pidgin /lib64/libglib-2.0.so.0.2000.4.#prelink#.Pd98lu (deleted) [.] 0x00000000038844 21.55% pidgin /lib64/libpthread-2.10.1.so.#prelink#.AFwK8Q (deleted) [.] 0x0000000000a42d 10.85% pidgin [kernel] [.] vread_hpet 7.85% pidgin /lib64/libgobject-2.0.so.0.2000.4.#prelink#.o1vpU7 (deleted) [.] 0x00000000014de8 3.35% pidgin /lib64/libc-2.10.1.so (deleted) [.] 0x0000000007a875 3.19% pidgin /lib64/libdbus-1.so.3.4.0.#prelink#.6mwgZP (deleted) [.] 0x0000000001d254 3.06% pidgin /usr/lib64/libgtk-x11-2.0.so.0.1600.5.#prelink#.511hAl (deleted) [.] 0x000000002334e7 2.90% pidgin /usr/lib64/libgdk-x11-2.0.so.0.1600.5.#prelink#.5qlMo1 (deleted) [.] 0x00000000037b2d 1.84% pidgin [kernel] [k] do_sys_poll 1.45% pidgin /usr/lib64/libX11.so.6.2.0.#prelink#.iR59Rx (deleted) [.] 0x0000000004c751 [acme@doppio linux-2.6-tip]$ Signed-off-by: Arnaldo Carvalho de Melo Cc: Luis Claudio R. Gonçalves Cc: Clark Williams Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frédéric Weisbecker LKML-Reference: <20090811200436.GA3478@ghostprotocols.net> Signed-off-by: Ingo Molnar --- tools/perf/util/symbol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 2473fd427be..5c0f42e6b33 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -801,6 +801,8 @@ more: } out: free(name); + if (ret < 0 && strstr(self->name, " (deleted)") != NULL) + return 0; return ret; } From 0a5ac84650fb7a7f226814103d95724e34b012ae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Aug 2009 11:18:01 +0200 Subject: [PATCH 38/60] perf record: Add missing -C option support for specifying profile cpu perf top supports a -C for setting the profile CPU, but perf record does not. This adds the same option for perf record, allowing the user to specify a specific target profile CPU. Signed-off-by: Jens Axboe Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20090812091801.GC12579@kernel.dk> Signed-off-by: Ingo Molnar --- tools/perf/builtin-record.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 30b83def03d..de76008fbaa 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -35,6 +35,7 @@ static const char *output_name = "perf.data"; static int group = 0; static unsigned int realtime_prio = 0; static int system_wide = 0; +static int profile_cpu = -1; static pid_t target_pid = -1; static int inherit = 1; static int force = 0; @@ -431,6 +432,8 @@ try_again: if (err == EPERM) die("Permission error - are you root?\n"); + else if (err == ENODEV && profile_cpu != -1) + die("No such device - did you specify an out-of-range profile CPU?\n"); /* * If it's cycles then fall back to hrtimer @@ -564,9 +567,15 @@ static int __cmd_record(int argc, const char **argv) if (pid == -1) pid = getpid(); - open_counters(-1, pid); - } else for (i = 0; i < nr_cpus; i++) - open_counters(i, target_pid); + open_counters(profile_cpu, pid); + } else { + if (profile_cpu != -1) { + open_counters(profile_cpu, target_pid); + } else { + for (i = 0; i < nr_cpus; i++) + open_counters(i, target_pid); + } + } if (file_new) perf_header__write(header, output); @@ -645,6 +654,8 @@ static const struct option options[] = { "system-wide collection from all CPUs"), OPT_BOOLEAN('A', "append", &append_file, "append to the output file to do incremental profiling"), + OPT_INTEGER('C', "profile_cpu", &profile_cpu, + "CPU to profile on"), OPT_BOOLEAN('f', "force", &force, "overwrite existing data file"), OPT_LONG('c', "count", &default_interval, From 04da8a43da804723a550f00dd158fd5b5e25ae35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:40:08 +0200 Subject: [PATCH 39/60] perf_counter, x86: Fix/improve apic fallback Johannes Stezenbach reported that his Pentium-M based laptop does not have the local APIC enabled by default, and hence perfcounters do not get initialized. Add a fallback for this case: allow non-sampled counters and return with an error on sampled counters. This allows 'perf stat' to work out of box - and allows 'perf top' and 'perf record' to fall back on a hrtimer based sampling method. ( Passing 'lapic' on the boot line will allow hardware sampling to occur - but if the APIC is disabled permanently by the hardware then this fallback still allows more systems to use perfcounters. ) Also decouple perfcounter support from X86_LOCAL_APIC. -v2: fix typo breaking counters on all other systems ... Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..13ffa5df37d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PERF_COUNTERS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -742,7 +743,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC - select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fffc126dbdf..900332b800f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -55,6 +55,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; }; @@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -644,10 +648,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -657,6 +663,7 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif } static void hw_perf_counter_destroy(struct perf_counter *counter) @@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = { .event_map = p6_pmu_event_map, .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, .num_counters = 2, @@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; @@ -1589,13 +1614,14 @@ static int p6_pmu_init(void) return -ENODEV; } + x86_pmu = p6_pmu; + if (!cpu_has_apic) { pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - return -ENODEV; + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; } - x86_pmu = p6_pmu; - return 0; } From 51d5668cb2e3fd1827a55184e48606fff054c5be Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 09:54:02 +1000 Subject: [PATCH 40/60] md: never advance 'events' counter by more than 1. When assembling arrays, md allows two devices to have different event counts as long as the difference is only '1'. This is to cope with a system failure between updating the metadata on two difference devices. However there are currently times when we update the event count by 2. This was done to keep the event count even when the array is clean and odd when it is dirty, which allows us to avoid writing common update to spare devices and so allow those spares to go to sleep. This is bad for the above reason. So change it to never increase by two. This means that the alignment between 'odd/even' and 'clean/dirty' might take a little longer to attain, but that is only a small cost. The spares will get a few more updates but that will still be spared (;-) most updates and can still go to sleep. Prior to this patch there was a small chance that after a crash an array would fail to assemble due to the overly large event count mismatch. Signed-off-by: NeilBrown --- drivers/md/md.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5614500092e..d18805fea11 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1975,17 +1975,14 @@ repeat: /* otherwise we have to go forward and ... */ mddev->events ++; if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, insist on an odd 'events' */ - if ((mddev->events&1)==0) { - mddev->events++; + /* .. if the array isn't clean, an 'even' event must also go + * to spares. */ + if ((mddev->events&1)==0) nospares = 0; - } } else { - /* otherwise insist on an even 'events' (for clean states) */ - if ((mddev->events&1)) { - mddev->events++; + /* otherwise an 'odd' event must go to spares */ + if ((mddev->events&1)) nospares = 0; - } } } From 67ac6011db5d2b0c853d573ff474b25c85dfb644 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:06:24 +1000 Subject: [PATCH 41/60] md/raid5: allow new reshape modes to be restarted in the middle. md/raid5 doesn't allow a reshape to restart if it involves writing over the same part of disk that it would be reading from. This happens at the beginning of a reshape that increases the number of devices, at the end of a reshape that decreases the number of devices, and continuously for a reshape that does not change the number of devices. The current code is correct for the "increase number of devices" case as the critical section at the start is handled by userspace performing a backup. It does not work for reducing the number of devices, or the no-change case. For 'reducing', we need to invert the test. For no-change we cannot really be sure things will be safe, so simply require the array to be read-only, which is how the user-space code which carefully starts such arrays works. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2b521ee67df..b8a22a2205c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev) (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ - if (here_new >= here_old) { + if (mddev->delta_disks == 0) { + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space if monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. + */ + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors) || + mddev->ro == 0) { + printk(KERN_ERR "raid5: in-place reshape must be started" + " in read-only mode - aborting\n"); + return -EINVAL; + } + } else if (mddev->delta_disks < 0 + ? (here_new * mddev->new_chunk_sectors <= + here_old * mddev->chunk_sectors) + : (here_new * mddev->new_chunk_sectors >= + here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "raid5: reshape_position too early for " "auto-recovery - aborting.\n"); From a639755cf885e437b2fe4168d35157fa90d530ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:13:00 +1000 Subject: [PATCH 42/60] md/raid5: make sure a reshape restarts at the correct address. This "if" don't allow for the possibility that the number of devices doesn't change, and so sector_nr isn't set correctly in that case. So change '>' to '>='. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b8a22a2205c..94a74cb5ccc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; - } else if (mddev->delta_disks > 0 && + } else if (mddev->delta_disks >= 0 && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); From 1a67dde0abba36421a1257d01ba9de2f6d1c160a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:41:49 +1000 Subject: [PATCH 43/60] md/raid5: Properly remove excess drives after shrinking a raid5/6 We were removing the drives, from the array, but not removing symlinks from /sys/.... and not marking the device as having been removed. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 94a74cb5ccc..b8a2c5dc67b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5097,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev) mddev->degraded--; for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; - d++) - raid5_remove_disk(mddev, d); + d++) { + mdk_rdev_t *rdev = conf->disks[d].rdev; + if (rdev && raid5_remove_disk(mddev, d) == 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = -1; + } + } } mddev->layout = conf->algorithm; mddev->chunk_sectors = conf->chunk_sectors; From 4d484a4a7a5126410eed5f8dd329a33f6eeed068 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:41:50 +1000 Subject: [PATCH 44/60] md: allow upper limit for resync/reshape to be set when array is read-only Normally we only allow the upper limit for a reshape to be decreased when the array not performing a sync/recovery/reshape, otherwise there could be races. But if an array is part-way through a reshape when it is assembled the reshape is started immediately leaving no window to set an upper bound. If the array is started read-only, the reshape will be suspended until the array becomes writable, so that provides a window during which it is perfectly safe to reduce the upper limit of a reshape. So: allow the upper limit (sync_max) to be reduced even if the reshape thread is running, as long as the array is still read-only. Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index d18805fea11..103f2d33fa8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3599,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && + mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; From 8f7a0dc51674ad0dd06155291b0aed60d655943c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2009 14:44:59 -0300 Subject: [PATCH 45/60] perf list: Fix large list output by using the pager When /sys/kernel/debug is mounted the list can be imense, so use the pager like the other tools. Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Cc: Peter Zijlstra LKML-Reference: <20090812174459.GB3495@ghostprotocols.net> Signed-off-by: Ingo Molnar --- tools/perf/builtin-list.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index f990fa8a35c..d88c6961274 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -10,11 +10,12 @@ #include "perf.h" -#include "util/parse-options.h" #include "util/parse-events.h" +#include "util/cache.h" int cmd_list(int argc __used, const char **argv __used, const char *prefix __used) { + setup_pager(); print_events(); return 0; } From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: [PATCH 46/60] perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe..2b36afe6ae0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a07f39..e26d2fcfa32 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } From 3a9f131fb00b8ac5950a11ad1599e45edfb5ae44 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 Aug 2009 10:27:18 +0200 Subject: [PATCH 47/60] perf tools: Add a per tracepoint counter attribute to get raw sample Add a new flag field while opening a tracepoint perf counter: -e tracepoint_subsystem:tracepoint_name:flags This is intended to be generic although for now it only supports the r[e[c[o[r[d]]]]] flag: ./perf record -e workqueue:workqueue_insertion:record ./perf record -e workqueue:workqueue_insertion:r will have the same effect: enabling the raw samples record for the given tracepoint counter. In the future, we may want to support further flags, separated by commas. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1250152039-7284-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-record.c | 2 +- tools/perf/util/parse-events.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index de76008fbaa..78adc47da86 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -398,7 +398,7 @@ static void create_counter(int counter, int cpu, pid_t pid) PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID; - attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; + attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; if (freq) { attr->sample_type |= PERF_SAMPLE_PERIOD; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4858d83b3b6..04417840878 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -379,6 +379,7 @@ static int parse_tracepoint_event(const char **strp, struct perf_counter_attr *attr) { const char *evt_name; + char *flags; char sys_name[MAX_EVENT_LENGTH]; char id_buf[4]; int fd; @@ -400,6 +401,15 @@ static int parse_tracepoint_event(const char **strp, strncpy(sys_name, *strp, sys_length); sys_name[sys_length] = '\0'; evt_name = evt_name + 1; + + flags = strchr(evt_name, ':'); + if (flags) { + *flags = '\0'; + flags++; + if (!strncmp(flags, "record", strlen(flags))) + attr->sample_type |= PERF_SAMPLE_RAW; + } + evt_length = strlen(evt_name); if (evt_length >= MAX_EVENT_LENGTH) return 0; From daac07b2e6b77f1bd44104aa2f0593e5505f27d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 Aug 2009 10:27:19 +0200 Subject: [PATCH 48/60] perf tools: Add a general option to enable raw sample records While we can enable the perf sample records per tracepoint counter, we may also want to enable this option for every tracepoint counters to open, so that we don't need to add a :record flag for all of them. Add the -R, --raw-samples options for this purpose. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1250152039-7284-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-record.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 78adc47da86..3d051b9cf25 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -34,6 +34,7 @@ static int output; static const char *output_name = "perf.data"; static int group = 0; static unsigned int realtime_prio = 0; +static int raw_samples = 0; static int system_wide = 0; static int profile_cpu = -1; static pid_t target_pid = -1; @@ -418,6 +419,8 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + if (raw_samples) + attr->sample_type |= PERF_SAMPLE_RAW; attr->mmap = track; attr->comm = track; @@ -650,6 +653,8 @@ static const struct option options[] = { "record events on existing pid"), OPT_INTEGER('r', "realtime", &realtime_prio, "collect data with this RT SCHED_FIFO priority"), + OPT_BOOLEAN('R', "raw-samples", &raw_samples, + "collect raw sample records from all opened counters"), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), OPT_BOOLEAN('A', "append", &append_file, From 8fd101f20bdf771949a8f3a5a779877d09b2fb56 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2009 18:19:57 -0300 Subject: [PATCH 49/60] perf report: Don't show unresolved DSOs and symbols when -S/-d is used We're interested in just those symbols/DSOs, so filter out the unresolved ones. Signed-off-by: Arnaldo Carvalho de Melo Cc: Peter Zijlstra LKML-Reference: <20090812211957.GE3495@ghostprotocols.net> Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 23e1457f240..b53a60fc12d 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1590,10 +1590,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (show & show_mask) { struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip); - if (dso_list && dso && dso->name && !strlist__has_entry(dso_list, dso->name)) + if (dso_list && (!dso || !dso->name || + !strlist__has_entry(dso_list, dso->name))) return 0; - if (sym_list && sym && !strlist__has_entry(sym_list, sym->name)) + if (sym_list && (!sym || !strlist__has_entry(sym_list, sym->name))) return 0; if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) { From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 09:51:55 +0200 Subject: [PATCH 50/60] perf_counter: Fix swcounter context invariance perf_swcounter_is_counting() uses a lock, which means we cannot use swcounters from NMI or when holding that particular lock, this is unintended. The below removes the lock, this opens up race window, but not worse than the swcounters already experience due to RCU traversal of the context in perf_swcounter_ctx_event(). This also fixes the hard lockups while opening a lockdep tracepoint counter. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Paul Mackerras Cc: stephane eranian Cc: Corey J Ashford LKML-Reference: <1250149915.10001.66.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 44 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e26d2fcfa32..3dd4339589a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, static int perf_swcounter_is_counting(struct perf_counter *counter) { - struct perf_counter_context *ctx; - unsigned long flags; - int count; - + /* + * The counter is active, we're good! + */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) return 1; + /* + * The counter is off/error, not counting. + */ if (counter->state != PERF_COUNTER_STATE_INACTIVE) return 0; /* - * If the counter is inactive, it could be just because - * its task is scheduled out, or because it's in a group - * which could not go on the PMU. We want to count in - * the first case but not the second. If the context is - * currently active then an inactive software counter must - * be the second case. If it's not currently active then - * we need to know whether the counter was active when the - * context was last active, which we can determine by - * comparing counter->tstamp_stopped with ctx->time. - * - * We are within an RCU read-side critical section, - * which protects the existence of *ctx. + * The counter is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. */ - ctx = counter->ctx; - spin_lock_irqsave(&ctx->lock, flags); - count = 1; - /* Re-check state now we have the lock */ - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->ctx->is_active || - counter->tstamp_stopped < ctx->time) - count = 0; - spin_unlock_irqrestore(&ctx->lock, flags); - return count; + if (counter->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; } static int perf_swcounter_match(struct perf_counter *counter, From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: [PATCH 51/60] perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 47 ++++-- kernel/perf_counter.c | 274 ++++++++++++++++++++++++++--------- 2 files changed, 238 insertions(+), 83 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe6ae0..b53f7006cc4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339589a..b8c6b97a20a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. - */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:54 +0200 Subject: [PATCH 52/60] perf_counter: Fix an ipi-deadlock perf_pending_counter() is called from IRQ context and will call perf_counter_disable(), however perf_counter_disable() uses smp_call_function_single() which doesn't fancy being used with IRQs disabled due to IPI deadlocks. Fix this by making it use the local __perf_counter_disable() call and teaching the counter_sched_out() code about pending disables as well. This should cover the case where a counter migrates before the pending queue gets processed. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.244097721@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8c6b97a20a..3f841beefc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + if (counter->pending_disable) { + counter->pending_disable = 0; + counter->state = PERF_COUNTER_STATE_OFF; + } counter->tstamp_stopped = ctx->time; counter->pmu->disable(counter); counter->oncpu = -1; @@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry) if (counter->pending_disable) { counter->pending_disable = 0; - perf_counter_disable(counter); + __perf_counter_disable(counter); } if (counter->pending_wakeup) { From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 16:14:42 +0200 Subject: [PATCH 53/60] perf_counter: Report the cloning task as parent on perf_counter_fork() A bug in (9f498cc: perf_counter: Full task tracing) makes profiling multi-threaded apps it go belly up. [ output as: (PID:TID):(PPID:PTID) ] # ./perf report -D | grep FORK 0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236) 0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236) 0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236) 0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236) 0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236) Shows us that the test (27d028d perf report: Update for the new FORK/EXIT events) in builtin-report.c: /* * A thread clone will have the same PID for both * parent and child. */ if (thread == parent) return 0; Will clearly fail. The problem is that perf_counter_fork() reports the actual parent, instead of the cloning thread. Fixing that (with the below patch), yields: # ./perf report -D | grep FORK 0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589) 0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590) 0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590) 0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590) 0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590) Which both makes more sense and doesn't confuse perf report anymore. Reported-by: Pekka Enberg Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: Anton Blanchard Cc: Arjan van de Ven LKML-Reference: <1250172882.5241.62.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f841beefc7..534e20d14d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter, return; task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.ppid = perf_counter_pid(counter, current); task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + task_event->event.ptid = perf_counter_tid(counter, current); perf_output_put(&handle, task_event->event); perf_output_end(&handle); From e694958388c50148389b0e9b9e9e8945cf0f1b98 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 08:28:36 -0700 Subject: [PATCH 54/60] Make sock_sendpage() use kernel_sendpage() kernel_sendpage() does the proper default case handling for when the socket doesn't have a native sendpage implementation. Now, arguably this might be something that we could instead solve by just specifying that all protocols should do it themselves at the protocol level, but we really only care about the common protocols. Does anybody really care about sendpage on something like Appletalk? Not likely. Acked-by: David S. Miller Acked-by: Julien TINNES Acked-by: Tavis Ormandy Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 791d71a36a9..6d471655904 100644 --- a/net/socket.c +++ b/net/socket.c @@ -736,7 +736,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, if (more) flags |= MSG_MORE; - return sock->ops->sendpage(sock, page, offset, size, flags); + return kernel_sendpage(sock, page, offset, size, flags); } static ssize_t sock_splice_read(struct file *file, loff_t *ppos, From 2d860ad76f4ee4d2eba0fe3797c8d7cdce432cc0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 13:05:10 -0700 Subject: [PATCH 55/60] genirq: prevent wakeup of freed irq thread free_irq() can remove an irqaction while the corresponding interrupt is in progress, but free_irq() sets action->thread to NULL unconditionally, which might lead to a NULL pointer dereference in handle_IRQ_event() when the hard interrupt context tries to wake up the handler thread. Prevent this by moving the thread stop after synchronize_irq(). No need to set action->thread to NULL either as action is going to be freed anyway. This fixes a boot crash reported against preempt-rt which uses the mainline irq threads code to implement full irq threading. [ tglx: removed local irqthread variable ] Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c679db468..d222515a5a0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -761,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; - struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -809,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - irqthread = action->thread; - action->thread = NULL; - spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -819,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); - if (irqthread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(irqthread); - put_task_struct(irqthread); - } - #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -840,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif + + if (action->thread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(action->thread); + put_task_struct(action->thread); + } + return action; } From 64f1607ffbbc772685733ea63e6f7f4183df1b16 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 15:43:34 -0700 Subject: [PATCH 56/60] Linux 2.6.31-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d46615bffe..abcfa85f8f8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 31 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* From 21bc1f024d0d4ea43fc0f2a43504e759261c7b18 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Sat, 15 Aug 2009 02:53:16 +0000 Subject: [PATCH 57/60] sh: skip disabled LCDC channels This patch updates the SuperH Mobile LCDC driver to skip over disabled channels. Without this patch suspend-to-ram operation will crash if deferred io is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/video/sh_mobile_lcdcfb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c index 8f24564f77b..07f22b62563 100644 --- a/drivers/video/sh_mobile_lcdcfb.c +++ b/drivers/video/sh_mobile_lcdcfb.c @@ -481,6 +481,9 @@ static int sh_mobile_lcdc_start(struct sh_mobile_lcdc_priv *priv) /* tell the board code to enable the panel */ for (k = 0; k < ARRAY_SIZE(priv->ch); k++) { ch = &priv->ch[k]; + if (!ch->enabled) + continue; + board_cfg = &ch->cfg.board_cfg; if (board_cfg->display_on) board_cfg->display_on(board_cfg->board_data); @@ -498,6 +501,8 @@ static void sh_mobile_lcdc_stop(struct sh_mobile_lcdc_priv *priv) /* clean up deferred io and ask board code to disable panel */ for (k = 0; k < ARRAY_SIZE(priv->ch); k++) { ch = &priv->ch[k]; + if (!ch->enabled) + continue; /* deferred io mode: * flush frame, and wait for frame end interrupt From f6431732f128a241b149c0aa85dfec852455ebf9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Sat, 15 Aug 2009 02:53:25 +0000 Subject: [PATCH 58/60] sh: CMT suspend/resume This patch updates the SuperH CMT driver with suspend and resume callbacks for the suspend-to-ram case. This patch stops the CMT channel at suspend time to avoid unwanted wake up events. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/clocksource/sh_cmt.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 2964f5f4a7e..6b3e0c2f33e 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -40,6 +40,7 @@ struct sh_cmt_priv { struct platform_device *pdev; unsigned long flags; + unsigned long flags_suspend; unsigned long match_value; unsigned long next_match_value; unsigned long max_match_value; @@ -667,11 +668,38 @@ static int __devexit sh_cmt_remove(struct platform_device *pdev) return -EBUSY; /* cannot unregister clockevent and clocksource */ } +static int sh_cmt_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct sh_cmt_priv *p = platform_get_drvdata(pdev); + + /* save flag state and stop CMT channel */ + p->flags_suspend = p->flags; + sh_cmt_stop(p, p->flags); + return 0; +} + +static int sh_cmt_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct sh_cmt_priv *p = platform_get_drvdata(pdev); + + /* start CMT channel from saved state */ + sh_cmt_start(p, p->flags_suspend); + return 0; +} + +static struct dev_pm_ops sh_cmt_dev_pm_ops = { + .suspend = sh_cmt_suspend, + .resume = sh_cmt_resume, +}; + static struct platform_driver sh_cmt_device_driver = { .probe = sh_cmt_probe, .remove = __devexit_p(sh_cmt_remove), .driver = { .name = "sh_cmt", + .pm = &sh_cmt_dev_pm_ops, } }; From 9747e78b304b44d6fb73e2c8071406d55aa8bb75 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Sat, 15 Aug 2009 02:53:34 +0000 Subject: [PATCH 59/60] sh: use in-soc KEYSC on se7724 This patch updates the Solution Engine 7724 board code to use in-SoC KEYSC resources for the keyboard platform device. Using the in-SoC key scan controller fixes a crash-during-resume issue. Without this patch the KEYSC hardware block located in the board specific FPGA is used together with an external IRQ which is routed through the FPGA and handled by some board specific demux code. This board specific FPGA interrupt code does not implement desc->set_wake() so the enable_irq_wake() call in the sh_keysc driver will fail at suspend-to-ram time and the disable_irq_wake() will bomb out when resuming. Changing the platform data to use the in-SoC KEYSC hardware makes the se7724 board support code less special which is a good thing. Also, the board specific KEYSC pin setup code selects in-SoC pin functions already which makes the current FPGA platform device data look like a typo. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7724/setup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index 8fed45a2fb8..15456a0773b 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -238,7 +238,7 @@ static struct platform_device ceu1_device = { }, }; -/* KEYSC */ +/* KEYSC in SoC (Needs SW33-2 set to ON) */ static struct sh_keysc_info keysc_info = { .mode = SH_KEYSC_MODE_1, .scan_timing = 10, @@ -255,12 +255,13 @@ static struct sh_keysc_info keysc_info = { static struct resource keysc_resources[] = { [0] = { - .start = 0x1a204000, - .end = 0x1a20400f, + .name = "KEYSC", + .start = 0x044b0000, + .end = 0x044b000f, .flags = IORESOURCE_MEM, }, [1] = { - .start = IRQ0_KEY, + .start = 79, .flags = IORESOURCE_IRQ, }, }; From 237674e050ae8ea40a432412df6c15d60b7ae8a6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Sat, 15 Aug 2009 02:53:42 +0000 Subject: [PATCH 60/60] sh: sh7724 ddr self-refresh changes This patch updates the SuperH Mobile sleep assembly code with support for DBSC memory controller found in the sh7724 processor. Without this fix the memory hooked up to the sh7724 processor will never enter self-refresh mode before suspending to ram. The effect of this is that the memory contents most likeley will be lost upon resume which may or may not be what you want. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/shmobile/sleep.S | 70 ++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/shmobile/sleep.S b/arch/sh/kernel/cpu/shmobile/sleep.S index 5d888ef53d8..baf2d7d46b0 100644 --- a/arch/sh/kernel/cpu/shmobile/sleep.S +++ b/arch/sh/kernel/cpu/shmobile/sleep.S @@ -26,8 +26,30 @@ ENTRY(sh_mobile_standby) tst #SUSP_SH_SF, r0 bt skip_set_sf +#ifdef CONFIG_CPU_SUBTYPE_SH7724 + /* DBSC: put memory in self-refresh mode */ - /* SDRAM: disable power down and put in self-refresh mode */ + mov.l dben_reg, r4 + mov.l dben_data0, r1 + mov.l r1, @r4 + + mov.l dbrfpdn0_reg, r4 + mov.l dbrfpdn0_data0, r1 + mov.l r1, @r4 + + mov.l dbcmdcnt_reg, r4 + mov.l dbcmdcnt_data0, r1 + mov.l r1, @r4 + + mov.l dbcmdcnt_reg, r4 + mov.l dbcmdcnt_data1, r1 + mov.l r1, @r4 + + mov.l dbrfpdn0_reg, r4 + mov.l dbrfpdn0_data1, r1 + mov.l r1, @r4 +#else + /* SBSC: disable power down and put in self-refresh mode */ mov.l 1f, r4 mov.l 2f, r1 mov.l @r4, r2 @@ -35,6 +57,7 @@ ENTRY(sh_mobile_standby) mov.l 3f, r3 and r3, r2 mov.l r2, @r4 +#endif skip_set_sf: tst #SUSP_SH_SLEEP, r0 @@ -84,7 +107,36 @@ done_sleep: tst #SUSP_SH_SF, r0 bt skip_restore_sf - /* SDRAM: set auto-refresh mode */ +#ifdef CONFIG_CPU_SUBTYPE_SH7724 + /* DBSC: put memory in auto-refresh mode */ + + mov.l dbrfpdn0_reg, r4 + mov.l dbrfpdn0_data0, r1 + mov.l r1, @r4 + + /* sleep 140 ns */ + nop + nop + nop + nop + + mov.l dbcmdcnt_reg, r4 + mov.l dbcmdcnt_data0, r1 + mov.l r1, @r4 + + mov.l dbcmdcnt_reg, r4 + mov.l dbcmdcnt_data1, r1 + mov.l r1, @r4 + + mov.l dben_reg, r4 + mov.l dben_data1, r1 + mov.l r1, @r4 + + mov.l dbrfpdn0_reg, r4 + mov.l dbrfpdn0_data2, r1 + mov.l r1, @r4 +#else + /* SBSC: set auto-refresh mode */ mov.l 1f, r4 mov.l @r4, r2 mov.l 4f, r3 @@ -98,15 +150,29 @@ done_sleep: add r4, r3 or r2, r3 mov.l r3, @r1 +#endif skip_restore_sf: rts nop .balign 4 +#ifdef CONFIG_CPU_SUBTYPE_SH7724 +dben_reg: .long 0xfd000010 /* DBEN */ +dben_data0: .long 0 +dben_data1: .long 1 +dbrfpdn0_reg: .long 0xfd000040 /* DBRFPDN0 */ +dbrfpdn0_data0: .long 0 +dbrfpdn0_data1: .long 1 +dbrfpdn0_data2: .long 0x00010000 +dbcmdcnt_reg: .long 0xfd000014 /* DBCMDCNT */ +dbcmdcnt_data0: .long 2 +dbcmdcnt_data1: .long 4 +#else 1: .long 0xfe400008 /* SDCR0 */ 2: .long 0x00000400 3: .long 0xffff7fff 4: .long 0xfffffbff +#endif 5: .long 0xa4150020 /* STBCR */ 6: .long 0xfe40001c /* RTCOR */ 7: .long 0xfe400018 /* RTCNT */