Browse Source

Merge branch 'blk-mq/core' into for-3.13/core

Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
	block/blk-timeout.c
master
Jens Axboe 8 years ago
parent
commit
e37459b8e2
  1. 5
      block/Makefile
  2. 157
      block/blk-core.c
  3. 14
      block/blk-exec.c
  4. 154
      block/blk-flush.c
  5. 17
      block/blk-merge.c
  6. 93
      block/blk-mq-cpu.c
  7. 108
      block/blk-mq-cpumap.c
  8. 384
      block/blk-mq-sysfs.c
  9. 204
      block/blk-mq-tag.c
  10. 27
      block/blk-mq-tag.h
  11. 1500
      block/blk-mq.c
  12. 52
      block/blk-mq.h
  13. 13
      block/blk-sysfs.c
  14. 74
      block/blk-timeout.c
  15. 17
      block/blk.h
  16. 3
      drivers/block/Kconfig
  17. 1
      drivers/block/Makefile
  18. 4
      drivers/block/floppy.c
  19. 635
      drivers/block/null_blk.c
  20. 2
      drivers/scsi/sd.c
  21. 2
      include/linux/bio.h
  22. 183
      include/linux/blk-mq.h
  23. 68
      include/linux/blk_types.h
  24. 60
      include/linux/blkdev.h
  25. 23
      include/linux/percpu_ida.h
  26. 7
      kernel/smp.c
  27. 15
      lib/percpu_counter.c
  28. 89
      lib/percpu_ida.c

5
block/Makefile

@ -5,8 +5,9 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
partition-generic.o partitions/
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o partitions/
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o

157
block/blk-core.c

@ -16,6 +16,7 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
/*
* For the allocated request tables
*/
static struct kmem_cache *request_cachep;
struct kmem_cache *request_cachep = NULL;
/*
* For queue allocation
@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
static void drive_stat_acct(struct request *rq, int new_io)
{
struct hd_struct *part;
int rw = rq_data_dir(rq);
int cpu;
if (!blk_do_io_stat(rq))
return;
cpu = part_stat_lock();
if (!new_io) {
part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
} else {
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
/*
* The partition is already being removed,
* the request will be accounted on the disk only
*
* We take a reference on disk->part0 although that
* partition will never be deleted, so we can treat
* it as any other partition.
*/
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
rq->part = part;
}
part_stat_unlock();
}
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->cmd = rq->__cmd;
rq->cmd_len = BLK_MAX_CDB;
rq->tag = -1;
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
{
int bit;
printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
rq->cmd_flags);
(unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
if (percpu_counter_init(&q->mq_usage_counter, 0))
goto fail_q;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
goto fail_q;
goto fail_c;
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@ -644,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->bypass_depth = 1;
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
init_waitqueue_head(&q->mq_freeze_wq);
if (blkcg_init_queue(q))
goto fail_bdi;
@ -653,6 +622,8 @@ fail_bdi:
bdi_destroy(&q->backing_dev_info);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_c:
percpu_counter_destroy(&q->mq_usage_counter);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
@ -1119,7 +1090,8 @@ retry:
goto retry;
}
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
static struct request *blk_old_get_request(struct request_queue *q, int rw,
gfp_t gfp_mask)
{
struct request *rq;
@ -1136,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
return rq;
}
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
if (q->mq_ops)
return blk_mq_alloc_request(q, rw, gfp_mask, false);
else
return blk_old_get_request(q, rw, gfp_mask);
}
EXPORT_SYMBOL(blk_get_request);
/**
@ -1221,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request);
static void add_acct_request(struct request_queue *q, struct request *rq,
int where)
{
drive_stat_acct(rq, 1);
blk_account_io_start(rq, true);
__elv_add_request(q, rq, where);
}
@ -1282,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req)
{
if (unlikely(!q))
return;
if (unlikely(--req->ref_count))
return;
blk_pm_put_request(req);
@ -1312,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
void blk_put_request(struct request *req)
{
unsigned long flags;
struct request_queue *q = req->q;
spin_lock_irqsave(q->queue_lock, flags);
__blk_put_request(q, req);
spin_unlock_irqrestore(q->queue_lock, flags);
if (q->mq_ops)
blk_mq_free_request(req);
else {
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
__blk_put_request(q, req);
spin_unlock_irqrestore(q->queue_lock, flags);
}
}
EXPORT_SYMBOL(blk_put_request);
@ -1353,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
}
EXPORT_SYMBOL_GPL(blk_add_request_payload);
static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio)
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
@ -1371,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
blk_account_io_start(req, false);
return true;
}
static bool bio_attempt_front_merge(struct request_queue *q,
struct request *req, struct bio *bio)
bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
@ -1401,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
blk_account_io_start(req, false);
return true;
}
/**
* attempt_plug_merge - try to merge with %current's plugged list
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @request_count: out parameter for number of traversed plugged requests
@ -1422,12 +1405,13 @@ static bool bio_attempt_front_merge(struct request_queue *q,
* reliable access to the elevator outside queue lock. Only check basic
* merging parameters without querying the elevator.
*/
static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count)
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count)
{
struct blk_plug *plug;
struct request *rq;
bool ret = false;
struct list_head *plug_list;
if (blk_queue_nomerges(q))
goto out;
@ -1437,7 +1421,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
goto out;
*request_count = 0;
list_for_each_entry_reverse(rq, &plug->list, queuelist) {
if (q->mq_ops)
plug_list = &plug->mq_list;
else
plug_list = &plug->list;
list_for_each_entry_reverse(rq, plug_list, queuelist) {
int el_ret;
if (rq->q == q)
@ -1505,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
if (attempt_plug_merge(q, bio, &request_count))
if (blk_attempt_plug_merge(q, bio, &request_count))
return;
spin_lock_irq(q->queue_lock);
@ -1573,7 +1562,7 @@ get_rq:
}
}
list_add_tail(&req->queuelist, &plug->list);
drive_stat_acct(req, 1);
blk_account_io_start(req, true);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
@ -2027,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
static void blk_account_io_completion(struct request *req, unsigned int bytes)
void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
const int rw = rq_data_dir(req);
@ -2041,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
}
}
static void blk_account_io_done(struct request *req)
void blk_account_io_done(struct request *req)
{
/*
* Account IO completion. flush_rq isn't accounted as a
@ -2089,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
}
#endif
void blk_account_io_start(struct request *rq, bool new_io)
{
struct hd_struct *part;
int rw = rq_data_dir(rq);
int cpu;
if (!blk_do_io_stat(rq))
return;
cpu = part_stat_lock();
if (!new_io) {
part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
} else {
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
/*
* The partition is already being removed,
* the request will be accounted on the disk only
*
* We take a reference on disk->part0 although that
* partition will never be deleted, so we can treat
* it as any other partition.
*/
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
rq->part = part;
}
part_stat_unlock();
}
/**
* blk_peek_request - peek at the top of a request queue
* @q: request queue to peek at
@ -2465,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error)
if (req->cmd_flags & REQ_DONTPREP)
blk_unprep_request(req);
blk_account_io_done(req);
if (req->end_io)
@ -2887,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
/*
@ -2984,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
BUG_ON(plug->magic != PLUG_MAGIC);
flush_plug_callbacks(plug, from_schedule);
if (!list_empty(&plug->mq_list))
blk_mq_flush_plug_list(plug, from_schedule);
if (list_empty(&plug->list))
return;

14
block/blk-exec.c

@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error)
struct completion *waiting = rq->end_io_data;
rq->end_io_data = NULL;
__blk_put_request(rq->q, rq);
/*
* complete last, if this is a stack request the process (and thus
@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
if (q->mq_ops) {
blk_mq_insert_request(q, rq, true);
return;
}
/*
* need to check this before __blk_run_queue(), because rq can
* be freed before that returns.
@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
int err = 0;
unsigned long hang_check;
/*
* we need an extra reference to the request, so we can look at
* it after io completion
*/
rq->ref_count++;
if (!rq->sense) {
memset(sense, 0, sizeof(sense));
rq->sense = sense;

154
block/blk-flush.c

@ -69,8 +69,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
/* FLUSH/FUA sequences */
enum {
@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
/* make @rq a normal request */
rq->cmd_flags &= ~REQ_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
blk_clear_rq_complete(rq);
}
static void mq_flush_data_run(struct work_struct *work)
{
struct request *rq;
rq = container_of(work, struct request, mq_flush_data);
memset(&rq->csd, 0, sizeof(rq->csd));
blk_mq_run_request(rq, true, false);
}
static void blk_mq_flush_data_insert(struct request *rq)
{
INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
kblockd_schedule_work(rq->q, &rq->mq_flush_data);
}
/**
@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
* completion and trigger the next step.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock)
* spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if requests were added to the dispatch queue, %false otherwise.
@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
bool queued = false;
bool queued = false, kicked;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
list_add(&rq->queuelist, &q->queue_head);
queued = true;
if (q->mq_ops)
blk_mq_flush_data_insert(rq);
else {
list_add(&rq->queuelist, &q->queue_head);
queued = true;
}
break;
case REQ_FSEQ_DONE:
@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
__blk_end_request_all(rq, error);
if (q->mq_ops)
blk_mq_end_io(rq, error);
else
__blk_end_request_all(rq, error);
break;
default:
BUG();
}
return blk_kick_flush(q) | queued;
kicked = blk_kick_flush(q);
/* blk_mq_run_flush will run queue */
if (q->mq_ops)
return queued;
return kicked | queued;
}
static void flush_end_io(struct request *flush_rq, int error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running = &q->flush_queue[q->flush_running_idx];
struct list_head *running;
bool queued = false;
struct request *rq, *n;
unsigned long flags = 0;
if (q->mq_ops) {
blk_mq_free_request(flush_rq);
spin_lock_irqsave(&q->mq_flush_lock, flags);
}
running = &q->flush_queue[q->flush_running_idx];
BUG_ON(q->flush_pending_idx == q->flush_running_idx);
/* account completion of the flush request */
q->flush_running_idx ^= 1;
elv_completed_request(q, flush_rq);
if (!q->mq_ops)
elv_completed_request(q, flush_rq);
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
* directly into request_fn may confuse the driver. Always use
* kblockd.
*/
if (queued || q->flush_queue_delayed)
blk_run_queue_async(q);
if (queued || q->flush_queue_delayed) {
if (!q->mq_ops)
blk_run_queue_async(q);
else
/*
* This can be optimized to only run queues with requests
* queued if necessary.
*/
blk_mq_run_queues(q, true);
}
q->flush_queue_delayed = 0;
if (q->mq_ops)
spin_unlock_irqrestore(&q->mq_flush_lock, flags);
}
static void mq_flush_work(struct work_struct *work)
{
struct request_queue *q;
struct request *rq;
q = container_of(work, struct request_queue, mq_flush_work);
/* We don't need set REQ_FLUSH_SEQ, it's for consistency */
rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
__GFP_WAIT|GFP_ATOMIC, true);
rq->cmd_type = REQ_TYPE_FS;
rq->end_io = flush_end_io;
blk_mq_run_request(rq, true, false);
}
/*
* We can't directly use q->flush_rq, because it doesn't have tag and is not in
* hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
* so offload the work to workqueue.
*
* Note: we assume a flush request finished in any hardware queue will flush
* the whole disk cache.
*/
static void mq_run_flush(struct request_queue *q)
{
kblockd_schedule_work(q, &q->mq_flush_work);
}
/**
@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock)
* spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if flush was issued, %false otherwise.
@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
q->flush_pending_idx ^= 1;
if (q->mq_ops) {
mq_run_flush(q);
return true;
}
blk_rq_init(q, &q->flush_rq);
q->flush_rq.cmd_type = REQ_TYPE_FS;
q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
q->flush_rq.rq_disk = first_rq->rq_disk;
q->flush_rq.end_io = flush_end_io;
q->flush_pending_idx ^= 1;
list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
return true;
}
@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
blk_run_queue_async(q);
}
static void mq_flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
unsigned long flags;
ctx = rq->mq_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
spin_lock_irqsave(&q->mq_flush_lock, flags);
if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
blk_mq_run_hw_queue(hctx, true);
spin_unlock_irqrestore(&q->mq_flush_lock, flags);
}
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
* or __blk_mq_run_hw_queue() to dispatch request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock)
* spin_lock_irq(q->queue_lock) in !mq case
*/
void blk_insert_flush(struct request *rq)
{
@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
if (!policy) {
__blk_end_bidi_request(rq, 0, 0, 0);
if (q->mq_ops)
blk_mq_end_io(rq, 0);
else
__blk_end_bidi_request(rq, 0, 0, 0);
return;
}
@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
list_add_tail(&rq->queuelist, &q->queue_head);
if (q->mq_ops) {
blk_mq_run_request(rq, false, true);
} else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
INIT_LIST_HEAD(&rq->flush.list);
rq->cmd_flags |= REQ_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
if (q->mq_ops) {
rq->end_io = mq_flush_data_end_io;
spin_lock_irq(&q->mq_flush_lock);
blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&q->mq_flush_lock);
return;
}
rq->end_io = flush_data_end_io;
blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
void blk_mq_init_flush(struct request_queue *q)
{
spin_lock_init(&q->mq_flush_lock);
INIT_WORK(&q->mq_flush_work, mq_flush_work);
}

17
block/blk-merge.c

@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
return ll_new_hw_segment(q, req, bio);
}
/*
* blk-mq uses req->special to carry normal driver per-request payload, it
* does not indicate a prepared command that we cannot merge with.
*/
static bool req_no_special_merge(struct request *req)
{
struct request_queue *q = req->q;
return !q->mq_ops && req->special;
}
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
* First check if the either of the requests are re-queued
* requests. Can't merge them if they are.
*/
if (req->special || next->special)
if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
/*
@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
|| next->special)
|| req_no_special_merge(next))
return 0;
if (req->cmd_flags & REQ_WRITE_SAME &&
@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
/* must be same device and not a special request */
if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
return false;
/* only merge integrity protected bio into ditto rq */

93
block/blk-mq-cpu.c

@ -0,0 +1,93 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
static LIST_HEAD(blk_mq_cpu_notify_list);
static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long) hcpu;
struct blk_mq_cpu_notifier *notify;
spin_lock(&blk_mq_cpu_notify_lock);
list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
notify->notify(notify->data, action, cpu);
spin_unlock(&blk_mq_cpu_notify_lock);
return NOTIFY_OK;
}
static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action,
unsigned int cpu)
{
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
/*
* If the CPU goes away, ensure that we run any pending
* completions.
*/
struct llist_node *node;
struct request *rq;
local_irq_disable();
node = llist_del_all(&per_cpu(ipi_lists, cpu));
while (node) {
struct llist_node *next = node->next;
rq = llist_entry(node, struct request, ll_list);
__blk_mq_end_io(rq, rq->errors);
node = next;
}
local_irq_enable();
}
}
static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = {
.notifier_call = blk_mq_main_cpu_notify,
};
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
BUG_ON(!notifier->notify);
spin_lock(&blk_mq_cpu_notify_lock);
list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
spin_lock(&blk_mq_cpu_notify_lock);
list_del(&notifier->list);
spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
void (*fn)(void *, unsigned long, unsigned int),
void *data)
{
notifier->notify = fn;
notifier->data = data;
}
static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = {
.notify = blk_mq_cpu_notify,
};
void __init blk_mq_cpu_init(void)
{
register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
blk_mq_register_cpu_notifier(&cpu_notifier);
}

108
block/blk-mq-cpumap.c

@ -0,0 +1,108 @@
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
static void show_map(unsigned int *map, unsigned int nr)
{
int i;
pr_info("blk-mq: CPU -> queue map\n");
for_each_online_cpu(i)
pr_info(" CPU%2u -> Queue %u\n", i, map[i]);
}
static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
const int cpu)
{
return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
}
static int get_first_sibling(unsigned int cpu)
{
unsigned int ret;
ret = cpumask_first(topology_thread_cpumask(cpu));
if (ret < nr_cpu_ids)
return ret;
return cpu;
}
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
return 1;
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
for_each_online_cpu(i) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
nr_uniq_cpus++;
cpumask_set_cpu(i, cpus);
}
queue = 0;
for_each_possible_cpu(i) {
if (!cpu_online(i)) {
map[i] = 0;
continue;
}
/*
* Easy case - we have equal or more hardware queues. Or
* there are no thread siblings to take into account. Do
* 1:1 if enough, or sequential mapping if less.
*/
if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
queue++;
continue;
}
/*
* Less then nr_cpus queues, and we have some number of
* threads per cores. Map sibling threads to the same
* queue.
*/
first_sibling = get_first_sibling(i);
if (first_sibling == i) {
map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
queue);
queue++;
} else
map[i] = map[first_sibling];
}
show_map(map, nr_cpus);
free_cpumask_var(cpus);
return 0;
}
unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
{
unsigned int *map;
/* If cpus are offline, map them to first hctx */
map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
reg->numa_node);
if (!map)
return NULL;
if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
return map;
kfree(map);
return NULL;
}

384
block/blk-mq-sysfs.c

@ -0,0 +1,384 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
#include "blk-mq-tag.h"
static void blk_mq_sysfs_release(struct kobject *kobj)
{
}
struct blk_mq_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_ctx *, char *);
ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
};
struct blk_mq_hw_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
};
static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
char *page)
{
struct blk_mq_ctx_sysfs_entry *entry;
struct blk_mq_ctx *ctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
q = ctx->queue;
if (!entry->show)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->show(ctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct blk_mq_ctx_sysfs_entry *entry;
struct blk_mq_ctx *ctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
q = ctx->queue;
if (!entry->store)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->store(ctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
struct blk_mq_hw_ctx_sysfs_entry *entry;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
if (!entry->show)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->show(hctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
struct attribute *attr, const char *page,
size_t length)
{
struct blk_mq_hw_ctx_sysfs_entry *entry;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
if (!entry->store)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->store(hctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
ctx->rq_dispatched[0]);
}
static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu\n", ctx->rq_merged);
}
static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
ctx->rq_completed[0]);
}
static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
{
char *start_page = page;
struct request *rq;
page += sprintf(page, "%s:\n", msg);
list_for_each_entry(rq, list, queuelist)
page += sprintf(page, "\t%p\n", rq);
return page - start_page;
}
static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
{
ssize_t ret;
spin_lock(&ctx->lock);
ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
spin_unlock(&ctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
return sprintf(page, "%lu\n", hctx->queued);
}
static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%lu\n", hctx->run);
}
static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
char *start_page = page;
int i;
page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
unsigned long d = 1U << (i - 1);
page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
}
return page - start_page;
}
static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
ssize_t ret;
spin_lock(&hctx->lock);
ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
spin_unlock(&hctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
{
ssize_t ret;
spin_lock(&hctx->lock);
ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
spin_unlock(&hctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
const char *page, size_t len)
{
struct blk_mq_ctx *ctx;
unsigned long ret;
unsigned int i;
if (kstrtoul(page, 10, &ret)) {
pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
return -EINVAL;
}
spin_lock(&hctx->lock);
if (ret)
hctx->flags |= BLK_MQ_F_SHOULD_IPI;
else
hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
spin_unlock(&hctx->lock);
hctx_for_each_ctx(hctx, ctx, i)
ctx->ipi_redirect = !!ret;
return len;
}
static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return blk_mq_tag_sysfs_show(hctx->tags, page);
}
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_sysfs_dispatched_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
.attr = {.name = "merged", .mode = S_IRUGO },
.show = blk_mq_sysfs_merged_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
.attr = {.name = "completed", .mode = S_IRUGO },
.show = blk_mq_sysfs_completed_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
.attr = {.name = "rq_list", .mode = S_IRUGO },
.show = blk_mq_sysfs_rq_list_show,
};
static struct attribute *default_ctx_attrs[] = {
&blk_mq_sysfs_dispatched.attr,
&blk_mq_sysfs_merged.attr,
&blk_mq_sysfs_completed.attr,
&blk_mq_sysfs_rq_list.attr,
NULL,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
.attr = {.name = "queued", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_queued_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
.attr = {.name = "run", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_run_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_dispatched_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
.attr = {.name = "pending", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_rq_list_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
.attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
.show = blk_mq_hw_sysfs_ipi_show,
.store = blk_mq_hw_sysfs_ipi_store,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
.attr = {.name = "tags", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_tags_show,
};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
&blk_mq_hw_sysfs_run.attr,
&blk_mq_hw_sysfs_dispatched.attr,
&blk_mq_hw_sysfs_pending.attr,
&blk_mq_hw_sysfs_ipi.attr,
&blk_mq_hw_sysfs_tags.attr,
NULL,
};
static const struct sysfs_ops blk_mq_sysfs_ops = {
.show = blk_mq_sysfs_show,
.store = blk_mq_sysfs_store,
};
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
.show = blk_mq_hw_sysfs_show,
.store = blk_mq_hw_sysfs_store,
};
static struct kobj_type blk_mq_ktype = {
.sysfs_ops = &blk_mq_sysfs_ops,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_ctx_ktype = {
.sysfs_ops = &blk_mq_sysfs_ops,
.default_attrs = default_ctx_attrs,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_hw_ktype = {
.sysfs_ops = &blk_mq_hw_sysfs_ops,
.default_attrs = default_hw_ctx_attrs,
.release = blk_mq_sysfs_release,
};
void blk_mq_unregister_disk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
kobject_del(&q->mq_kobj);
kobject_put(&disk_to_dev(disk)->kobj);
}
int blk_mq_register_disk(struct gendisk *disk)
{
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int ret, i, j;
kobject_init(&q->mq_kobj, &blk_mq_ktype);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
return ret;
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
queue_for_each_hw_ctx(q, hctx, i) {
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
if (ret)
break;
if (!hctx->nr_ctx)
continue;
hctx_for_each_ctx(hctx, ctx, j) {
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
break;
}
}
if (ret) {
blk_mq_unregister_disk(disk);
return ret;
}
return 0;
}

204
block/blk-mq-tag.c

@ -0,0 +1,204 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu_ida.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
/*
* Per tagged queue (tag address space) map
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
unsigned int nr_batch_move;
unsigned int nr_max_cache;
struct percpu_ida free_tags;
struct percpu_ida reserved_tags;
};
void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
{
int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
blk_mq_put_tag(tags, tag);
}
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
return !tags ||
percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
}
static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
{
int tag;
tag = percpu_ida_alloc(&tags->free_tags, gfp);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
return tag + tags->nr_reserved_tags;
}
static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
gfp_t gfp)
{
int tag;
if (unlikely(!tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
return tag;
}
unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
{
if (!reserved)
return __blk_mq_get_tag(tags, gfp);
return __blk_mq_get_reserved_tag(tags, gfp);
}
static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
BUG_ON(tag >= tags->nr_tags);
percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
}
static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
unsigned int tag)
{
BUG_ON(tag >= tags->nr_reserved_tags);
percpu_ida_free(&tags->reserved_tags, tag);
}
void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
if (tag >= tags->nr_reserved_tags)
__blk_mq_put_tag(tags, tag);
else
__blk_mq_put_reserved_tag(tags, tag);
}
static int __blk_mq_tag_iter(unsigned id, void *data)
{
unsigned long *tag_map = data;
__set_bit(id, tag_map);
return 0;
}
void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
void (*fn)(void *, unsigned long *), void *data)
{
unsigned long *tag_map;
size_t map_size;
map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
if (!tag_map)
return;
percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
if (tags->nr_reserved_tags)
percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
tag_map);
fn(data, tag_map);
kfree(tag_map);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, int node)
{
unsigned int nr_tags, nr_cache;
struct blk_mq_tags *tags;
int ret;
if (total_tags > BLK_MQ_TAG_MAX) {
pr_err("blk-mq: tag depth too large\n");
return NULL;
}
tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
if (!tags)
return NULL;