diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index d8147b336c3..6518a55273e 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice this amount, since it applies only to reads or writes (not the accumulated sum). +To avoid priority inversion through request starvation, a request +queue maintains a separate request pool per each cgroup when +CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such +per-block-cgroup request pool. IOW, if there are N block cgroups, +each request queue may have upto N request pools, each independently +regulated by nr_requests. + read_ahead_kb (RW) ------------------ Maximum number of kilobytes to read-ahead for filesystems on this block diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7dee617358..f3b44a65fc7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; -struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkcg, css); -} -EXPORT_SYMBOL_GPL(cgroup_to_blkcg); - -static struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); -} - -struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); - return task_blkcg(current); -} -EXPORT_SYMBOL_GPL(bio_blkcg); - static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(pd); } + blk_exit_rl(&blkg->rl); kfree(blkg); } @@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg) * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @q: request_queue the new blkg is associated with + * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. */ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, + gfp_t gfp_mask) { struct blkcg_gq *blkg; int i; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); if (!blkg) return NULL; @@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) blkg->blkcg = blkcg; blkg->refcnt = 1; + /* root blkg uses @q->root_rl, init rl only for !root blkgs */ + if (blkcg != &blkcg_root) { + if (blk_init_rl(&blkg->rl, q, gfp_mask)) + goto err_free; + blkg->rl.blkg = blkg; + } + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); - if (!pd) { - blkg_free(blkg); - return NULL; - } + pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + if (!pd) + goto err_free; blkg->pd[i] = pd; pd->blkg = blkg; @@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) } return blkg; + +err_free: + blkg_free(blkg); + return NULL; } static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, @@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) } EXPORT_SYMBOL_GPL(blkg_lookup); +/* + * If @new_blkg is %NULL, this function tries to allocate a new one as + * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + */ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) + struct request_queue *q, + struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int ret; @@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, blkg = __blkg_lookup(blkcg, q); if (blkg) { rcu_assign_pointer(blkcg->blkg_hint, blkg); - return blkg; + goto out_free; } /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) - return ERR_PTR(-EINVAL); + if (!css_tryget(&blkcg->css)) { + blkg = ERR_PTR(-EINVAL); + goto out_free; + } /* allocate */ - ret = -ENOMEM; - blkg = blkg_alloc(blkcg, q); - if (unlikely(!blkg)) - goto err_put; + if (!new_blkg) { + new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + if (unlikely(!new_blkg)) { + blkg = ERR_PTR(-ENOMEM); + goto out_put; + } + } + blkg = new_blkg; /* insert */ - ret = radix_tree_preload(GFP_ATOMIC); - if (ret) - goto err_free; - spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { @@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, } spin_unlock(&blkcg->lock); - radix_tree_preload_end(); - if (!ret) return blkg; -err_free: - blkg_free(blkg); -err_put: + + blkg = ERR_PTR(ret); +out_put: css_put(&blkcg->css); - return ERR_PTR(ret); +out_free: + blkg_free(new_blkg); + return blkg; } struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, @@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, */ if (unlikely(blk_queue_bypass(q))) return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); - return __blkg_lookup_create(blkcg, q); + return __blkg_lookup_create(blkcg, q, NULL); } EXPORT_SYMBOL_GPL(blkg_lookup_create); @@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg) } EXPORT_SYMBOL_GPL(__blkg_release); +/* + * The next function used by blk_queue_for_each_rl(). It's a bit tricky + * because the root blkg uses @q->root_rl instead of its own rl. + */ +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q) +{ + struct list_head *ent; + struct blkcg_gq *blkg; + + /* + * Determine the current blkg list_head. The first entry is + * root_rl which is off @q->blkg_list and mapped to the head. + */ + if (rl == &q->root_rl) { + ent = &q->blkg_list; + } else { + blkg = container_of(rl, struct blkcg_gq, rl); + ent = &blkg->q_node; + } + + /* walk to the next list_head, skip root blkcg */ + ent = ent->next; + if (ent == &q->root_blkg->q_node) + ent = ent->next; + if (ent == &q->blkg_list) + return NULL; + + blkg = container_of(ent, struct blkcg_gq, q_node); + return &blkg->rl; +} + static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q, struct blkcg_gq *blkg; struct blkg_policy_data *pd, *n; int cnt = 0, ret; + bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; + /* preallocations for root blkg */ + blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + blk_queue_bypass_start(q); /* make sure the root blkg exists and count the existing blkgs */ spin_lock_irq(q->queue_lock); rcu_read_lock(); - blkg = __blkg_lookup_create(&blkcg_root, q); + blkg = __blkg_lookup_create(&blkcg_root, q, blkg); rcu_read_unlock(); + if (preloaded) + radix_tree_preload_end(); + if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto out_unlock; } q->root_blkg = blkg; + q->root_rl.blkg = blkg; list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8ac457ce778..24597309e23 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,6 +17,7 @@ #include #include #include +#include /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -93,6 +94,8 @@ struct blkcg_gq { struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; + /* request allocation list for this blkcg-q pair */ + struct request_list rl; /* reference count */ int refcnt; @@ -120,8 +123,6 @@ struct blkcg_policy { extern struct blkcg blkcg_root; -struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup); -struct blkcg *bio_blkcg(struct bio *bio); struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); @@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); +static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkcg, css); +} + +static inline struct blkcg *task_blkcg(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkcg, css); +} + +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_css) + return container_of(bio->bi_css, struct blkcg, css); + return task_blkcg(current); +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -233,6 +253,95 @@ static inline void blkg_put(struct blkcg_gq *blkg) __blkg_release(blkg); } +/** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} + +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + /** * blkg_stat_add - add a value to a blkg_stat * @stat: target blkg_stat @@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) #else /* CONFIG_BLK_CGROUP */ struct cgroup; +struct blkcg; struct blkg_policy_data { }; @@ -361,8 +471,6 @@ struct blkcg_gq { struct blkcg_policy { }; -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } @@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } +static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } @@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 93eb3e4f88c..dd134d834d5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); - drain |= q->rq.elvpriv; + drain |= q->nr_rqs_elvpriv; /* * Unfortunately, requests are queued at and tracked from @@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (drain_all) { drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { - drain |= q->rq.count[i]; + drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; drain |= !list_empty(&q->flush_queue[i]); } @@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) * left with hung waiters. We need to wake up those waiters. */ if (q->request_fn) { + struct request_list *rl; + spin_lock_irq(q->queue_lock); - for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) - wake_up_all(&q->rq.wait[i]); + + blk_queue_for_each_rl(rl, q) + for (i = 0; i < ARRAY_SIZE(rl->wait); i++) + wake_up_all(&rl->wait[i]); + spin_unlock_irq(q->queue_lock); } } @@ -517,28 +522,33 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static int blk_init_free_list(struct request_queue *q) +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask) { - struct request_list *rl = &q->rq; - if (unlikely(rl->rq_pool)) return 0; + rl->q = q; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - rl->elvpriv = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, q->node); - + mempool_free_slab, request_cachep, + gfp_mask, q->node); if (!rl->rq_pool) return -ENOMEM; return 0; } +void blk_exit_rl(struct request_list *rl) +{ + if (rl->rq_pool) + mempool_destroy(rl->rq_pool); +} + struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { return blk_alloc_queue_node(gfp_mask, -1); @@ -680,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - if (blk_init_free_list(q)) + if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) return NULL; q->request_fn = rfn; @@ -722,15 +732,15 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static inline void blk_free_request(struct request_queue *q, struct request *rq) +static inline void blk_free_request(struct request_list *rl, struct request *rq) { if (rq->cmd_flags & REQ_ELVPRIV) { - elv_put_request(q, rq); + elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); } - mempool_free(rq, q->rq.rq_pool); + mempool_free(rq, rl->rq_pool); } /* @@ -767,18 +777,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int sync) +static void __freed_request(struct request_list *rl, int sync) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; - if (rl->count[sync] < queue_congestion_off_threshold(q)) + /* + * bdi isn't aware of blkcg yet. As all async IOs end up root + * blkcg anyway, just use root blkcg state. + */ + if (rl == &q->root_rl && + rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); - blk_clear_queue_full(q, sync); + blk_clear_rl_full(rl, sync); } } @@ -786,19 +801,20 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, unsigned int flags) +static void freed_request(struct request_list *rl, unsigned int flags) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; int sync = rw_is_sync(flags); + q->nr_rqs[sync]--; rl->count[sync]--; if (flags & REQ_ELVPRIV) - rl->elvpriv--; + q->nr_rqs_elvpriv--; - __freed_request(q, sync); + __freed_request(rl, sync); if (unlikely(rl->starved[sync ^ 1])) - __freed_request(q, sync ^ 1); + __freed_request(rl, sync ^ 1); } /* @@ -837,8 +853,8 @@ static struct io_context *rq_ioc(struct bio *bio) } /** - * get_request - get a free request - * @q: request_queue to allocate request from + * __get_request - get a free request + * @rl: request list to allocate from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask @@ -850,20 +866,16 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns %NULL on failure, with @q->queue_lock held. * Returns !%NULL on success, with @q->queue_lock *not held*. */ -static struct request *get_request(struct request_queue *q, int rw_flags, - struct bio *bio, gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, int rw_flags, + struct bio *bio, gfp_t gfp_mask) { + struct request_queue *q = rl->q; struct request *rq; - struct request_list *rl = &q->rq; - struct elevator_type *et; - struct io_context *ioc; + struct elevator_type *et = q->elevator->type; + struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - bool retried = false; int may_queue; -retry: - et = q->elevator->type; - ioc = rq_ioc(bio); if (unlikely(blk_queue_dead(q))) return NULL; @@ -874,29 +886,15 @@ retry: if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[is_sync]+1 >= q->nr_requests) { - /* - * We want ioc to record batching state. If it's - * not already there, creating a new one requires - * dropping queue_lock, which in turn requires - * retesting conditions to avoid queue hang. - */ - if (!ioc && !retried) { - spin_unlock_irq(q->queue_lock); - create_io_context(gfp_mask, q->node); - spin_lock_irq(q->queue_lock); - retried = true; - goto retry; - } - /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". * This process will be allowed to complete a batch of * requests, others will be blocked. */ - if (!blk_queue_full(q, is_sync)) { + if (!blk_rl_full(rl, is_sync)) { ioc_set_batching(q, ioc); - blk_set_queue_full(q, is_sync); + blk_set_rl_full(rl, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { @@ -909,7 +907,12 @@ retry: } } } - blk_set_queue_congested(q, is_sync); + /* + * bdi isn't aware of blkcg yet. As all async IOs end up + * root blkcg anyway, just use root blkcg state. + */ + if (rl == &q->root_rl) + blk_set_queue_congested(q, is_sync); } /* @@ -920,6 +923,7 @@ retry: if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) return NULL; + q->nr_rqs[is_sync]++; rl->count[is_sync]++; rl->starved[is_sync] = 0; @@ -935,7 +939,7 @@ retry: */ if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { rw_flags |= REQ_ELVPRIV; - rl->elvpriv++; + q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } @@ -945,22 +949,19 @@ retry: spin_unlock_irq(q->queue_lock); /* allocate and init request */ - rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + rq = mempool_alloc(rl->rq_pool, gfp_mask); if (!rq) goto fail_alloc; blk_rq_init(q, rq); + blk_rq_set_rl(rq, rl); rq->cmd_flags = rw_flags | REQ_ALLOCED; /* init elvpriv */ if (rw_flags & REQ_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { - create_io_context(gfp_mask, q->node); - ioc = rq_ioc(bio); - if (!ioc) - goto fail_elvpriv; - - icq = ioc_create_icq(ioc, q, gfp_mask); + if (ioc) + icq = ioc_create_icq(ioc, q, gfp_mask); if (!icq) goto fail_elvpriv; } @@ -1000,7 +1001,7 @@ fail_elvpriv: rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); - rl->elvpriv--; + q->nr_rqs_elvpriv--; spin_unlock_irq(q->queue_lock); goto out; @@ -1013,7 +1014,7 @@ fail_alloc: * queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, rw_flags); + freed_request(rl, rw_flags); /* * in the very unlikely event that allocation failed and no @@ -1029,56 +1030,58 @@ rq_starved: } /** - * get_request_wait - get a free request with retry + * get_request - get a free request * @q: request_queue to allocate request from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask * - * Get a free request from @q. This function keeps retrying under memory - * pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this + * function keeps retrying under memory pressure and fails iff @q is dead. * * Must be callled with @q->queue_lock held and, * Returns %NULL on failure, with @q->queue_lock held. * Returns !%NULL on success, with @q->queue_lock *not held*. */ -static struct request *get_request_wait(struct request_queue *q, int rw_flags, - struct bio *bio) +static struct request *get_request(struct request_queue *q, int rw_flags, + struct bio *bio, gfp_t gfp_mask) { const bool is_sync = rw_is_sync(rw_flags) != 0; + DEFINE_WAIT(wait); + struct request_list *rl; struct request *rq; - rq = get_request(q, rw_flags, bio, GFP_NOIO); - while (!rq) { - DEFINE_WAIT(wait); - struct request_list *rl = &q->rq; + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ +retry: + rq = __get_request(rl, rw_flags, bio, gfp_mask); + if (rq) + return rq; - if (unlikely(blk_queue_dead(q))) - return NULL; + if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) { + blk_put_rl(rl); + return NULL; + } - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); + /* wait on @rl and retry */ + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, + TASK_UNINTERRUPTIBLE); - trace_block_sleeprq(q, bio, rw_flags & 1); + trace_block_sleeprq(q, bio, rw_flags & 1); - spin_unlock_irq(q->queue_lock); - io_schedule(); + spin_unlock_irq(q->queue_lock); + io_schedule(); - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - create_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, current->io_context); + /* + * After sleeping, we become a "batching" process and will be able + * to allocate at least one request, and up to a big batch of them + * for a small period time. See ioc_batching, ioc_set_batching + */ + ioc_set_batching(q, current->io_context); - spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); + spin_lock_irq(q->queue_lock); + finish_wait(&rl->wait[is_sync], &wait); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - }; - - return rq; + goto retry; } struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) @@ -1087,11 +1090,11 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) BUG_ON(rw != READ && rw != WRITE); + /* create ioc upfront */ + create_io_context(gfp_mask, q->node); + spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) - rq = get_request_wait(q, rw, NULL); - else - rq = get_request(q, rw, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (!rq) spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ @@ -1248,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (req->cmd_flags & REQ_ALLOCED) { unsigned int flags = req->cmd_flags; + struct request_list *rl = blk_rq_rl(req); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); - blk_free_request(q, req); - freed_request(q, flags); + blk_free_request(rl, req); + freed_request(rl, flags); + blk_put_rl(rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1481,7 +1486,7 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request_wait(q, rw_flags, bio); + req = get_request(q, rw_flags, bio, GFP_NOIO); if (unlikely(!req)) { bio_endio(bio, -ENODEV); /* @q is dead */ goto out_unlock; @@ -1702,6 +1707,14 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + /* + * Various block parts want %current->io_context and lazy ioc + * allocation ends up trading a lot of pain for a small amount of + * memory. Just allocate it upfront. This may fail and block + * layer knows how to live with it. + */ + create_io_context(GFP_ATOMIC, q->node); + if (blk_throtl_bio(q, bio)) return false; /* throttled, will be resubmitted later */ diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 893b8007c65..fab4cdd3f7b 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -244,6 +244,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) /* initialize */ atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); diff --git a/block/blk-settings.c b/block/blk-settings.c index d3234fc494a..565a6786032 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -143,8 +143,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; - - lim->max_sectors = BLK_DEF_MAX_SECTORS; + lim->max_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index aa41b47c22d..9628b291f96 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl = &q->rq; + struct request_list *rl; unsigned long nr; int ret; @@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) blk_set_queue_congested(q, BLK_RW_SYNC); else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) @@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, BLK_RW_ASYNC); - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_queue_full(q, BLK_RW_SYNC); - } else { - blk_clear_queue_full(q, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } } - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_queue_full(q, BLK_RW_ASYNC); - } else { - blk_clear_queue_full(q, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } spin_unlock_irq(q->queue_lock); return ret; } @@ -476,7 +482,6 @@ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - struct request_list *rl = &q->rq; blk_sync_queue(q); @@ -489,8 +494,7 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } - if (rl->rq_pool) - mempool_destroy(rl->rq_pool); + blk_exit_rl(&q->root_rl); if (q->queue_tags) __blk_queue_free_tags(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5b065951204..e287c19908c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1123,9 +1123,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) goto out; } - /* bio_associate_current() needs ioc, try creating */ - create_io_context(GFP_ATOMIC, q->node); - /* * A throtl_grp pointer retrieved under rcu can be used to access * basic fields like stats and io rates. If a group has no rules, diff --git a/block/blk.h b/block/blk.h index 85f6ae42f7d..2a0ea32d249 100644 --- a/block/blk.h +++ b/block/blk.h @@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask); +void blk_exit_rl(struct request_list *rl); void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); @@ -33,7 +36,6 @@ bool __blk_end_bidi_request(struct request *rq, int error, void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); -void __generic_unplug_device(struct request_queue *); /* * Internal atomic flags for request handling diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 7ad49c88f6b..deee61fbb74 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -243,56 +243,3 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q, return 0; } EXPORT_SYMBOL_GPL(bsg_setup_queue); - -/** - * bsg_remove_queue - Deletes the bsg dev from the q - * @q: the request_queue that is to be torn down. - * - * Notes: - * Before unregistering the queue empty any requests that are blocked - */ -void bsg_remove_queue(struct request_queue *q) -{ - struct request *req; /* block request */ - int counts; /* totals for request_list count and starved */ - - if (!q) - return; - - /* Stop taking in new requests */ - spin_lock_irq(q->queue_lock); - blk_stop_queue(q); - - /* drain all requests in the queue */ - while (1) { - /* need the lock to fetch a request - * this may fetch the same reqeust as the previous pass - */ - req = blk_fetch_request(q); - /* save requests in use and starved */ - counts = q->rq.count[0] + q->rq.count[1] + - q->rq.starved[0] + q->rq.starved[1]; - spin_unlock_irq(q->queue_lock); - /* any requests still outstanding? */ - if (counts == 0) - break; - - /* This may be the same req as the previous iteration, - * always send the blk_end_request_all after a prefetch. - * It is not okay to not end the request because the - * prefetch started the request. - */ - if (req) { - /* return -ENXIO to indicate that this queue is - * going away - */ - req->errors = -ENXIO; - blk_end_request_all(req, -ENXIO); - } - - msleep(200); /* allow bsg to possibly finish */ - spin_lock_irq(q->queue_lock); - } - bsg_unregister_queue(q); -} -EXPORT_SYMBOL_GPL(bsg_remove_queue); diff --git a/block/genhd.c b/block/genhd.c index 9cf5583c90f..cac7366957c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part->nr_sects && + if (!part_nr_sects_read(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part->nr_sects; + sector < part->start_sect + part_nr_sects_read(part); } /** @@ -769,8 +769,8 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf), + (unsigned long long)part_nr_sects_read(part) >> 1 + , disk_name(disk, part->partno, name_buf), uuid_buf); if (is_part0) { if (disk->driverfs_dev != NULL && @@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part->nr_sects >> 1, + (unsigned long long)part_nr_sects_read(part) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } disk->part_tbl->part[0] = &disk->part0; + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + seqcount_init(&disk->part0.nr_sects_seq); hd_ref_init(&disk->part0); disk->minors = minors; diff --git a/block/ioctl.c b/block/ioctl.c index ba15b2dbfb9..4476e0e85d1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user { struct block_device *bdevp; struct gendisk *disk; - struct hd_struct *part; + struct hd_struct *part, *lpart; struct blkpg_ioctl_arg a; struct blkpg_partition p; struct disk_part_iter piter; @@ -36,8 +36,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user case BLKPG_ADD_PARTITION: start = p.start >> 9; length = p.length >> 9; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) == sizeof(long) && + /* check for fit in a hd_struct */ + if (sizeof(sector_t) == sizeof(long) && sizeof(long long) > sizeof(long)) { long pstart = start, plength = length; if (pstart != start || plength != length @@ -91,6 +91,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user mutex_unlock(&bdevp->bd_mutex); bdput(bdevp); + return 0; + case BLKPG_RESIZE_PARTITION: + start = p.start >> 9; + /* new length of partition in bytes */ + length = p.length >> 9; + /* check for fit in a hd_struct */ + if (sizeof(sector_t) == sizeof(long) && + sizeof(long long) > sizeof(long)) { + long pstart = start, plength = length; + if (pstart != start || plength != length + || pstart < 0 || plength < 0) + return -EINVAL; + } + part = disk_get_part(disk, partno); + if (!part) + return -ENXIO; + bdevp = bdget(part_devt(part)); + if (!bdevp) { + disk_put_part(part); + return -ENOMEM; + } + mutex_lock(&bdevp->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, 1); + if (start != part->start_sect) { + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); + return -EINVAL; + } + /* overlap? */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY); + while ((lpart = disk_part_iter_next(&piter))) { + if (lpart->partno != partno && + !(start + length <= lpart->start_sect || + start >= lpart->start_sect + lpart->nr_sects) + ) { + disk_part_iter_exit(&piter); + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); + return -EBUSY; + } + } + disk_part_iter_exit(&piter); + part_nr_sects_write(part, (sector_t)length); + i_size_write(bdevp->bd_inode, p.length); + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); return 0; default: return -EINVAL; diff --git a/block/partition-generic.c b/block/partition-generic.c index 6df5d6928a4..f1d14519cc0 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); + return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); } static ssize_t part_ro_show(struct device *dev, @@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, err = -ENOMEM; goto out_free; } + + seqcount_init(&p->nr_sects_seq); pdev = part_to_dev(p); p->start_sect = start; diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 2d1e68db9b3..e894ca7b54c 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -4146,45 +4146,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) static void fc_bsg_remove(struct request_queue *q) { - struct request *req; /* block request */ - int counts; /* totals for request_list count and starved */ - if (q) { - /* Stop taking in new requests */ - spin_lock_irq(q->queue_lock); - blk_stop_queue(q); - - /* drain all requests in the queue */ - while (1) { - /* need the lock to fetch a request - * this may fetch the same reqeust as the previous pass - */ - req = blk_fetch_request(q); - /* save requests in use and starved */ - counts = q->rq.count[0] + q->rq.count[1] + - q->rq.starved[0] + q->rq.starved[1]; - spin_unlock_irq(q->queue_lock); - /* any requests still outstanding? */ - if (counts == 0) - break; - - /* This may be the same req as the previous iteration, - * always send the blk_end_request_all after a prefetch. - * It is not okay to not end the request because the - * prefetch started the request. - */ - if (req) { - /* return -ENXIO to indicate that this queue is - * going away - */ - req->errors = -ENXIO; - blk_end_request_all(req, -ENXIO); - } - - msleep(200); /* allow bsg to possibly finish */ - spin_lock_irq(q->queue_lock); - } - bsg_unregister_queue(q); blk_cleanup_queue(q); } diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 09809d06ecc..fa1dfaa83e3 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -575,7 +575,7 @@ static int iscsi_remove_host(struct transport_container *tc, struct iscsi_cls_host *ihost = shost->shost_data; if (ihost->bsg_q) { - bsg_remove_queue(ihost->bsg_q); + bsg_unregister_queue(ihost->bsg_q); blk_cleanup_queue(ihost->bsg_q); } return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 07954b05b86..3816ce8a08f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -46,16 +46,23 @@ struct blkcg_gq; struct request; typedef void (rq_end_io_fn)(struct request *, int); +#define BLK_RL_SYNCFULL (1U << 0) +#define BLK_RL_ASYNCFULL (1U << 1) + struct request_list { + struct request_queue *q; /* the queue this rl belongs to */ +#ifdef CONFIG_BLK_CGROUP + struct blkcg_gq *blkg; /* blkg this request pool belongs to */ +#endif /* * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC */ - int count[2]; - int starved[2]; - int elvpriv; - mempool_t *rq_pool; - wait_queue_head_t wait[2]; + int count[2]; + int starved[2]; + mempool_t *rq_pool; + wait_queue_head_t wait[2]; + unsigned int flags; }; /* @@ -138,6 +145,7 @@ struct request { struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; unsigned long long io_start_time_ns; /* when passed to hardware */ #endif @@ -282,11 +290,16 @@ struct request_queue { struct list_head queue_head; struct request *last_merge; struct elevator_queue *elevator; + int nr_rqs[2]; /* # allocated [a]sync rqs */ + int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ /* - * the queue request freelist, one for reads and one for writes + * If blkcg is not used, @q->root_rl serves all requests. If blkcg + * is used, root blkg allocates from @q->root_rl and all other + * blkgs from their own blkg->rl. Which one to use should be + * determined using bio_request_list(). */ - struct request_list rq; + struct request_list root_rl; request_fn_proc *request_fn; make_request_fn *make_request_fn; @@ -561,27 +574,25 @@ static inline bool rq_is_sync(struct request *rq) return rw_is_sync(rq->cmd_flags); } -static inline int blk_queue_full(struct request_queue *q, int sync) +static inline bool blk_rl_full(struct request_list *rl, bool sync) { - if (sync) - return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); - return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + return rl->flags & flag; } -static inline void blk_set_queue_full(struct request_queue *q, int sync) +static inline void blk_set_rl_full(struct request_list *rl, bool sync) { - if (sync) - queue_flag_set(QUEUE_FLAG_SYNCFULL, q); - else - queue_flag_set(QUEUE_FLAG_ASYNCFULL, q); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + rl->flags |= flag; } -static inline void blk_clear_queue_full(struct request_queue *q, int sync) +static inline void blk_clear_rl_full(struct request_list *rl, bool sync) { - if (sync) - queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); - else - queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + rl->flags &= ~flag; } diff --git a/include/linux/blkpg.h b/include/linux/blkpg.h index faf8a45af21..a8519446c11 100644 --- a/include/linux/blkpg.h +++ b/include/linux/blkpg.h @@ -40,6 +40,7 @@ struct blkpg_ioctl_arg { /* The subfunctions (for the op field) */ #define BLKPG_ADD_PARTITION 1 #define BLKPG_DEL_PARTITION 2 +#define BLKPG_RESIZE_PARTITION 3 /* Sizes of name fields. Unused at present. */ #define BLKPG_DEVNAMELTH 64 diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index f55ab8cdc10..4d0fb3df2f4 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -67,7 +67,6 @@ void bsg_job_done(struct bsg_job *job, int result, int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name, bsg_job_fn *job_fn, int dd_job_size); void bsg_request_fn(struct request_queue *q); -void bsg_remove_queue(struct request_queue *q); void bsg_goose_queue(struct request_queue *q); #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ae0aaa9d42f..4f440b3e89f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -97,7 +97,13 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; + /* + * nr_sects is protected by sequence counter. One might extend a + * partition while IO is happening to it and update of nr_sects + * can be non-atomic on 32bit machines with 64bit sector_t. + */ sector_t nr_sects; + seqcount_t nr_sects_seq; sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; @@ -647,6 +653,57 @@ static inline void hd_struct_put(struct hd_struct *part) __delete_partition(part); } +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPT case optimizes the case of UP kernel with preemption + * on. + */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); +#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7c08052e332..39ed62ab5b8 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -26,7 +26,8 @@ typedef struct mempool_s { extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, int nid); + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int nid); extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); extern void mempool_destroy(mempool_t *pool); diff --git a/mm/mempool.c b/mm/mempool.c index d9049811f35..54990476c04 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy); mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_create); mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, int node_id) + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) { mempool_t *pool; - pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); + pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; pool->elements = kmalloc_node(min_nr * sizeof(void *), - GFP_KERNEL, node_id); + gfp_mask, node_id); if (!pool->elements) { kfree(pool); return NULL; @@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, while (pool->curr_nr < pool->min_nr) { void *element; - element = pool->alloc(GFP_KERNEL, pool->pool_data); + element = pool->alloc(gfp_mask, pool->pool_data); if (unlikely(!element)) { mempool_destroy(pool); return NULL;