This allows filesystems and O_DIRECT callers to send down a list of bios
flagged for atomic completion. If the hardware supports atomic
IO, it is handed the whole list in a single make_request_fn
call.
To limit corner cases, the current code imposes a few restrictions:
* Every bio in the list must be for the same queue
* Every bio must be a simple write. No trims or reads may be mixed in.
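As a rough caller-side illustration (not part of this patch; the helper
name is made up), a submitter might chain two already-built write bios
that target the same queue and push them down as one atomic unit:

	/*
	 * Hypothetical sketch only.  Assumes the REQ_ATOMIC flag from
	 * this series and that both bios are plain writes on one queue.
	 */
	static void submit_atomic_pair(struct bio *a, struct bio *b)
	{
		a->bi_rw |= WRITE | REQ_ATOMIC;
		b->bi_rw |= WRITE | REQ_ATOMIC;
		a->bi_next = b;		/* the list travels via bi_next */
		b->bi_next = NULL;
		submit_bio(WRITE | REQ_ATOMIC, a);
	}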
A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.
Any number greater than one is allowed, but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.
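On the driver side, a hedged sketch of how this could be wired up
(driver names are hypothetical and the segment count is an arbitrary
example):

	/* advertise atomic write support when setting up the queue */
	static void exdrv_init_queue(struct request_queue *q)
	{
		blk_queue_set_atomic_write(q, 32);	/* example limit */
	}

	/*
	 * In the make_request_fn, re-check that the linked list really
	 * fits the hardware before committing to it.
	 */
	static void exdrv_make_request(struct request_queue *q, struct bio *first)
	{
		struct bio *bio = first;
		unsigned int segs = 0;

		for_each_bio(bio)
			segs += bio_segments(bio);

		if ((first->bi_rw & REQ_ATOMIC) &&
		    segs > q->limits.atomic_write_segments) {
			/* the list must fail as a unit: end every bio */
			while ((bio = first)) {
				first = bio->bi_next;
				bio->bi_next = NULL;
				bio_endio(bio, -EINVAL);
			}
			return;
		}
		/* ... queue the whole list to the hardware atomically ... */
	}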
Signed-off-by: Chris Mason <***@fusionio.com>
---
block/blk-core.c | 217 +++++++++++++++++++++++++++++++------------------
block/blk-settings.c | 17 ++++
include/linux/blkdev.h | 14 ++++
3 files changed, 170 insertions(+), 78 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 39d1261..6a5c292 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
return 0;
}
+static void end_linked_bio(struct bio *bio, int err)
+{
+ struct bio *next;
+ do {
+ next = bio->bi_next;
+ bio->bi_next = NULL;
+ bio_endio(bio, err);
+ bio = next;
+ } while (bio);
+}
+
static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks(struct bio *first_bio)
{
- struct request_queue *q;
- int nr_sectors = bio_sectors(bio);
+ struct request_queue *q = NULL;
+ int nr_sectors;
int err = -EIO;
char b[BDEVNAME_SIZE];
struct hd_struct *part;
+ struct bio *bio;
+ int linked_bio = first_bio->bi_next ? 1 : 0;
might_sleep();
- if (bio_check_eod(bio, nr_sectors))
- goto end_io;
+ bio = first_bio;
+ for_each_bio(bio) {
+ nr_sectors = bio_sectors(bio);
+ if (bio_check_eod(bio, nr_sectors))
+ goto end_io;
- q = bdev_get_queue(bio->bi_bdev);
- if (unlikely(!q)) {
- printk(KERN_ERR
- "generic_make_request: Trying to access "
- "nonexistent block-device %s (%Lu)\n",
- bdevname(bio->bi_bdev, b),
- (long long) bio->bi_iter.bi_sector);
- goto end_io;
- }
+ if (!q) {
+ q = bdev_get_queue(bio->bi_bdev);
+ if (unlikely(!q)) {
+ printk(KERN_ERR
+ "generic_make_request: Trying to access "
+ "nonexistent block-device %s (%Lu)\n",
+ bdevname(bio->bi_bdev, b),
+ (long long) bio->bi_iter.bi_sector);
+ goto end_io;
+ }
+ } else if (q != bdev_get_queue(bio->bi_bdev)) {
+ printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
+ goto end_io;
+ }
- if (likely(bio_is_rw(bio) &&
- nr_sectors > queue_max_hw_sectors(q))) {
- printk(KERN_ERR "bio too big device %s (%u > %u)\n",
- bdevname(bio->bi_bdev, b),
- bio_sectors(bio),
- queue_max_hw_sectors(q));
- goto end_io;
- }
+ if (likely(bio_is_rw(bio) &&
+ nr_sectors > queue_max_hw_sectors(q))) {
+ printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+ bdevname(bio->bi_bdev, b),
+ bio_sectors(bio),
+ queue_max_hw_sectors(q));
+ goto end_io;
+ }
- part = bio->bi_bdev->bd_part;
- if (should_fail_request(part, bio->bi_iter.bi_size) ||
- should_fail_request(&part_to_disk(part)->part0,
- bio->bi_iter.bi_size))
- goto end_io;
+ part = bio->bi_bdev->bd_part;
+ if (should_fail_request(part, bio->bi_iter.bi_size) ||
+ should_fail_request(&part_to_disk(part)->part0,
+ bio->bi_iter.bi_size))
+ goto end_io;
- /*
- * If this device has partitions, remap block n
- * of partition p to block n+start(p) of the disk.
- */
- blk_partition_remap(bio);
+ /*
+ * If this device has partitions, remap block n
+ * of partition p to block n+start(p) of the disk.
+ */
+ blk_partition_remap(bio);
- if (bio_check_eod(bio, nr_sectors))
- goto end_io;
+ if (bio_check_eod(bio, nr_sectors))
+ goto end_io;
- /*
- * Filter flush bio's early so that make_request based
- * drivers without flush support don't have to worry
- * about them.
- */
- if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
- bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
- if (!nr_sectors) {
- err = 0;
+ /*
+ * Filter flush bio's early so that make_request based
+ * drivers without flush support don't have to worry
+ * about them.
+ */
+ if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+ bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+ if (!nr_sectors) {
+ /*
+ * we don't know how to mix empty flush bios
+ * with a list of non-flush bios on devices
+ * that don't support flushing
+ */
+ if (linked_bio)
+ err = -EINVAL;
+ else
+ err = 0;
+ goto end_io;
+ }
+ }
+
+ if ((bio->bi_rw & REQ_DISCARD) &&
+ (!blk_queue_discard(q) ||
+ ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+ err = -EOPNOTSUPP;
goto end_io;
}
- }
- if ((bio->bi_rw & REQ_DISCARD) &&
- (!blk_queue_discard(q) ||
- ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
+ if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
- if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
+ if ((bio->bi_rw & REQ_ATOMIC) &&
+ !q->limits.atomic_write_segments) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
- /*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
- */
- create_io_context(GFP_ATOMIC, q->node);
+ /*
+ * Various block parts want %current->io_context and lazy ioc
+ * allocation ends up trading a lot of pain for a small amount of
+ * memory. Just allocate it upfront. This may fail and block
+ * layer knows how to live with it.
+ */
+ create_io_context(GFP_ATOMIC, q->node);
- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
+ if (blk_throtl_bio(q, bio))
+ return false; /* throttled, will be resubmitted later */
- trace_block_bio_queue(q, bio);
+ trace_block_bio_queue(q, bio);
+ }
return true;
end_io:
- bio_endio(bio, err);
+ end_linked_bio(first_bio, err);
return false;
}
@@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
return;
/*
+ * generic_make_request checks for atomic write support, we'll have
+ * failed already if the queue doesn't support it
+ */
+ if (bio->bi_rw & REQ_ATOMIC) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+ q->make_request_fn(q, bio);
+ return;
+ }
+
+ /*
* We only want one ->make_request_fn to be active at a time, else
* stack usage with stacked devices could be a problem. So use
* current->bio_list to keep a list of requests submited by a
@@ -1815,6 +1862,10 @@ void generic_make_request(struct bio *bio)
* from the top. In this case we really did just take the bio
* of the top of the list (no pretending) and so remove it from
* bio_list, and call into ->make_request() again.
+ *
+ * REQ_ATOMIC bios may have been chained on bi_next, but we
+ * should have caught them all above. This BUG_ON(bi_next)
+ * will catch any lists of bios that were not flagged as atomic
*/
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack);
@@ -1849,28 +1900,38 @@ void submit_bio(int rw, struct bio *bio)
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
- unsigned int count;
-
- if (unlikely(rw & REQ_WRITE_SAME))
- count = bdev_logical_block_size(bio->bi_bdev) >> 9;
- else
- count = bio_sectors(bio);
+ unsigned int count = 0;
+ unsigned int size = 0;
+ struct bio *walk;
+
+ walk = bio;
+ for_each_bio(walk) {
+ if (unlikely(rw & REQ_WRITE_SAME))
+ count += bdev_logical_block_size(walk->bi_bdev) >> 9;
+ else
+ count += bio_sectors(walk);
+ size += walk->bi_iter.bi_size;
+ }
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
- task_io_account_read(bio->bi_iter.bi_size);
+ task_io_account_read(size);
count_vm_events(PGPGIN, count);
}
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
- printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
- current->comm, task_pid_nr(current),
- (rw & WRITE) ? "WRITE" : "READ",
- (unsigned long long)bio->bi_iter.bi_sector,
- bdevname(bio->bi_bdev, b),
- count);
+
+ walk = bio;
+ for_each_bio(walk) {
+ printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
+ current->comm, task_pid_nr(current),
+ (rw & WRITE) ? "WRITE" : "READ",
+ (unsigned long long)walk->bi_iter.bi_sector,
+ bdevname(walk->bi_bdev, b),
+ bio_sectors(walk));
+ }
}
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5330933..17a6d23 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -119,6 +119,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->discard_zeroes_data = 0;
+ lim->atomic_write_segments = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@@ -804,6 +805,22 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
/**
+ * blk_queue_set_atomic_write - number of segments supported for atomic writes
+ * @q: the request queue for the device
+ * @segments: number of segments supported
+ *
+ * Description:
+ * If the device supports atomic (or transactional) writes, it can pass
+ * the maximum number of segments it supports in here. An atomic write
+ * is either completed as a whole, or none of it gets written.
+ **/
+void blk_queue_set_atomic_write(struct request_queue *q, unsigned int segments)
+{
+ q->limits.atomic_write_segments = segments;
+}
+EXPORT_SYMBOL(blk_queue_set_atomic_write);
+
+/**
* blk_queue_flush - configure queue's cache flush capability
* @q: the request queue for the device
* @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca0119d..40238bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,6 +283,8 @@ struct queue_limits {
unsigned int discard_granularity;
unsigned int discard_alignment;
+ unsigned int atomic_write_segments;
+
unsigned short logical_block_size;
unsigned short max_segments;
unsigned short max_integrity_segments;
@@ -968,6 +970,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned short)
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
+extern void blk_queue_set_atomic_write(struct request_queue *q,
+ unsigned int segments);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1190,6 +1194,16 @@ static inline unsigned short queue_logical_block_size(struct request_queue *q)
return retval;
}
+static inline unsigned short bdev_atomic_write_segments(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q)
+ return q->limits.atomic_write_segments;
+
+ return 0;
+}
+
static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
{
return queue_logical_block_size(bdev_get_queue(bdev));
--
1.8.2