Discussion:
[PATCH 0/2] Support for atomic IOs
Chris Mason
2013-11-01 21:27:04 UTC
Permalink
Hi everyone,

I'm still testing this (especially to look for unrelated regressions in
MD and DM), but wanted to send it out for comments.

These two patches implement support for atomic IOs down to the hardware.
The basic idea is that we give a list of bios to the storage and even
if they are discontiguous the hardware promises to complete them as a
single atomic unit.

The first user for this is O_DIRECT, especially for mysql based
databases. O_ATOMIC | O_DIRECT allows mysql and friends to disable
double buffering. This cuts their write IO in half, making them
roughly 2x more flash friendly.

The patches are on top of Jens' current blk-mq/core tree. Jens
also has chaining patches from Kent, but he is chaining bios likely to
be sent to separate devices, while I'm chaining bios that must
(currently) be sent to the same device as a group.

As far as I can tell, there isn't much overlap in our targets or
methods, but I'd like to eventually merge the ideas together.

-chris

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Chris Mason
2013-11-01 21:28:54 UTC
Permalink
This allows filesystems and O_DIRECT to send down a list of bios
flagged for atomic completion. If the hardware supports atomic
IO, it is given the whole list in a single make_request_fn
call.

In order to limit corner cases, there are a few restrictions in the
current code:

* Every bio in the list must be for the same queue

* Every bio must be a simple write. No trims or reads may be mixed in

A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.

Any number greater than one is allowed, but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.

Signed-off-by: Chris Mason <***@fusionio.com>
---
block/blk-core.c | 217 +++++++++++++++++++++++++++++++------------------
block/blk-settings.c | 17 ++++
include/linux/blkdev.h | 14 ++++
3 files changed, 170 insertions(+), 78 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 39d1261..6a5c292 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
return 0;
}

+static void end_linked_bio(struct bio *bio, int err)
+{
+ struct bio *next;
+ do {
+ next = bio->bi_next;
+ bio->bi_next = NULL;
+ bio_endio(bio, err);
+ bio = next;
+ } while (bio);
+}
+
static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks(struct bio *first_bio)
{
- struct request_queue *q;
- int nr_sectors = bio_sectors(bio);
+ struct request_queue *q = NULL;
+ int nr_sectors;
int err = -EIO;
char b[BDEVNAME_SIZE];
struct hd_struct *part;
+ struct bio *bio;
+ int linked_bio = first_bio->bi_next ? 1 : 0;

might_sleep();

- if (bio_check_eod(bio, nr_sectors))
- goto end_io;
+ bio = first_bio;
+ for_each_bio(bio) {
+ nr_sectors = bio_sectors(bio);
+ if (bio_check_eod(bio, nr_sectors))
+ goto end_io;

- q = bdev_get_queue(bio->bi_bdev);
- if (unlikely(!q)) {
- printk(KERN_ERR
- "generic_make_request: Trying to access "
- "nonexistent block-device %s (%Lu)\n",
- bdevname(bio->bi_bdev, b),
- (long long) bio->bi_iter.bi_sector);
- goto end_io;
- }
+ if (!q) {
+ q = bdev_get_queue(bio->bi_bdev);
+ if (unlikely(!q)) {
+ printk(KERN_ERR
+ "generic_make_request: Trying to access "
+ "nonexistent block-device %s (%Lu)\n",
+ bdevname(bio->bi_bdev, b),
+ (long long) bio->bi_iter.bi_sector);
+ goto end_io;
+ }
+ } else if (q != bdev_get_queue(bio->bi_bdev)) {
+ printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
+ goto end_io;
+ }

- if (likely(bio_is_rw(bio) &&
- nr_sectors > queue_max_hw_sectors(q))) {
- printk(KERN_ERR "bio too big device %s (%u > %u)\n",
- bdevname(bio->bi_bdev, b),
- bio_sectors(bio),
- queue_max_hw_sectors(q));
- goto end_io;
- }
+ if (likely(bio_is_rw(bio) &&
+ nr_sectors > queue_max_hw_sectors(q))) {
+ printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+ bdevname(bio->bi_bdev, b),
+ bio_sectors(bio),
+ queue_max_hw_sectors(q));
+ goto end_io;
+ }

- part = bio->bi_bdev->bd_part;
- if (should_fail_request(part, bio->bi_iter.bi_size) ||
- should_fail_request(&part_to_disk(part)->part0,
- bio->bi_iter.bi_size))
- goto end_io;
+ part = bio->bi_bdev->bd_part;
+ if (should_fail_request(part, bio->bi_iter.bi_size) ||
+ should_fail_request(&part_to_disk(part)->part0,
+ bio->bi_iter.bi_size))
+ goto end_io;

- /*
- * If this device has partitions, remap block n
- * of partition p to block n+start(p) of the disk.
- */
- blk_partition_remap(bio);
+ /*
+ * If this device has partitions, remap block n
+ * of partition p to block n+start(p) of the disk.
+ */
+ blk_partition_remap(bio);

- if (bio_check_eod(bio, nr_sectors))
- goto end_io;
+ if (bio_check_eod(bio, nr_sectors))
+ goto end_io;

- /*
- * Filter flush bio's early so that make_request based
- * drivers without flush support don't have to worry
- * about them.
- */
- if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
- bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
- if (!nr_sectors) {
- err = 0;
+ /*
+ * Filter flush bio's early so that make_request based
+ * drivers without flush support don't have to worry
+ * about them.
+ */
+ if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+ bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+ if (!nr_sectors) {
+ /*
+ * we don't know how to mix empty flush bios
+ * with a list of non-flush bios on devices
+ * that don't support flushing
+ */
+ if (linked_bio)
+ err = -EINVAL;
+ else
+ err = 0;
+ goto end_io;
+ }
+ }
+
+ if ((bio->bi_rw & REQ_DISCARD) &&
+ (!blk_queue_discard(q) ||
+ ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+ err = -EOPNOTSUPP;
goto end_io;
}
- }

- if ((bio->bi_rw & REQ_DISCARD) &&
- (!blk_queue_discard(q) ||
- ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
+ if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }

- if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
+ if ((bio->bi_rw & REQ_ATOMIC) &&
+ !q->limits.atomic_write_segments) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }

- /*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
- */
- create_io_context(GFP_ATOMIC, q->node);
+ /*
+ * Various block parts want %current->io_context and lazy ioc
+ * allocation ends up trading a lot of pain for a small amount of
+ * memory. Just allocate it upfront. This may fail and block
+ * layer knows how to live with it.
+ */
+ create_io_context(GFP_ATOMIC, q->node);

- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
+ if (blk_throtl_bio(q, bio))
+ return false; /* throttled, will be resubmitted later */

- trace_block_bio_queue(q, bio);
+ trace_block_bio_queue(q, bio);
+ }
return true;

end_io:
- bio_endio(bio, err);
+ end_linked_bio(first_bio, err);
return false;
}

@@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
return;

/*
+ * generic_make_request checks for atomic write support, we'll have
+ * failed already if the queue doesn't support it
+ */
+ if (bio->bi_rw & REQ_ATOMIC) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+ q->make_request_fn(q, bio);
+ return;
+ }
+
+ /*
* We only want one ->make_request_fn to be active at a time, else
* stack usage with stacked devices could be a problem. So use
* current->bio_list to keep a list of requests submited by a
@@ -1815,6 +1862,10 @@ void generic_make_request(struct bio *bio)
* from the top. In this case we really did just take the bio
* of the top of the list (no pretending) and so remove it from
* bio_list, and call into ->make_request() again.
+ *
+ * REQ_ATOMIC bios may have been chained on bi_next, but we
+ * should have caught them all above. This BUG_ON(bi_next)
+ * will catch any lists of bios that were not flagged as atomic
*/
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack);
@@ -1849,28 +1900,38 @@ void submit_bio(int rw, struct bio *bio)
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
- unsigned int count;
-
- if (unlikely(rw & REQ_WRITE_SAME))
- count = bdev_logical_block_size(bio->bi_bdev) >> 9;
- else
- count = bio_sectors(bio);
+ unsigned int count = 0;
+ unsigned int size = 0;
+ struct bio *walk;
+
+ walk = bio;
+ for_each_bio(walk) {
+ if (unlikely(rw & REQ_WRITE_SAME))
+ count += bdev_logical_block_size(walk->bi_bdev) >> 9;
+ else
+ count += bio_sectors(walk);
+ size += walk->bi_iter.bi_size;
+ }

if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
- task_io_account_read(bio->bi_iter.bi_size);
+ task_io_account_read(size);
count_vm_events(PGPGIN, count);
}

if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
- printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
- current->comm, task_pid_nr(current),
- (rw & WRITE) ? "WRITE" : "READ",
- (unsigned long long)bio->bi_iter.bi_sector,
- bdevname(bio->bi_bdev, b),
- count);
+
+ walk = bio;
+ for_each_bio(walk) {
+ printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
+ current->comm, task_pid_nr(current),
+ (rw & WRITE) ? "WRITE" : "READ",
+ (unsigned long long)walk->bi_iter.bi_sector,
+ bdevname(walk->bi_bdev, b),
+ bio_sectors(walk));
+ }
}
}

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5330933..17a6d23 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -119,6 +119,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->discard_zeroes_data = 0;
+ lim->atomic_write_segments = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@@ -804,6 +805,22 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
EXPORT_SYMBOL(blk_queue_update_dma_alignment);

/**
+ * blk_queue_set_atomic_write - number of segments supported for atomic writes
+ * @q: the request queue for the device
+ * @segments: number of segments supported
+ *
+ * description:
+ * If the device supports atomic (or transactional) writes, then it can pass
+ * the maximum number of segments it supports in here. Atomic writes are
+ * either completed as a whole, or none of it gets written.
+ **/
+void blk_queue_set_atomic_write(struct request_queue *q, unsigned int segments)
+{
+ q->limits.atomic_write_segments = segments;
+}
+EXPORT_SYMBOL(blk_queue_set_atomic_write);
+
+/**
* blk_queue_flush - configure queue's cache flush capability
* @q: the request queue for the device
* @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca0119d..40238bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,6 +283,8 @@ struct queue_limits {
unsigned int discard_granularity;
unsigned int discard_alignment;

+ unsigned int atomic_write_segments;
+
unsigned short logical_block_size;
unsigned short max_segments;
unsigned short max_integrity_segments;
@@ -968,6 +970,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned short)
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
+extern void blk_queue_set_atomic_write(struct request_queue *q,
+ unsigned int segments);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1190,6 +1194,16 @@ static inline unsigned short queue_logical_block_size(struct request_queue *q)
return retval;
}

+static inline unsigned short bdev_atomic_write_segments(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q)
+ return q->limits.atomic_write_segments;
+
+ return 0;
+}
+
static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
{
return queue_logical_block_size(bdev_get_queue(bdev));
--
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Shaohua Li
2013-11-01 21:47:28 UTC
Permalink
Post by Chris Mason
This allows filesystems and O_DIRECT to send down a list of bios
flagged for atomic completion. If the hardware supports atomic
IO, it is given the whole list in a single make_request_fn
call.
In order to limit corner cases, there are a few restrictions in the
* Every bio in the list must be for the same queue
* Every bio must be a simple write. No trims or reads may be mixed in
A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.
If the queue is request based, I thought we have problem with linked bios. For
example, init request will be wrong with linked bios.
Post by Chris Mason
Any number greater than one is allowed, but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.
---
block/blk-core.c | 217 +++++++++++++++++++++++++++++++------------------
block/blk-settings.c | 17 ++++
include/linux/blkdev.h | 14 ++++
3 files changed, 170 insertions(+), 78 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 39d1261..6a5c292 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
return 0;
}
+static void end_linked_bio(struct bio *bio, int err)
+{
+ struct bio *next;
+ do {
+ next = bio->bi_next;
+ bio->bi_next = NULL;
+ bio_endio(bio, err);
+ bio = next;
+ } while (bio);
+}
+
static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks(struct bio *first_bio)
{
- struct request_queue *q;
- int nr_sectors = bio_sectors(bio);
+ struct request_queue *q = NULL;
+ int nr_sectors;
int err = -EIO;
char b[BDEVNAME_SIZE];
struct hd_struct *part;
+ struct bio *bio;
+ int linked_bio = first_bio->bi_next ? 1 : 0;
might_sleep();
- if (bio_check_eod(bio, nr_sectors))
- goto end_io;
+ bio = first_bio;
+ for_each_bio(bio) {
+ nr_sectors = bio_sectors(bio);
+ if (bio_check_eod(bio, nr_sectors))
+ goto end_io;
- q = bdev_get_queue(bio->bi_bdev);
- if (unlikely(!q)) {
- printk(KERN_ERR
- "generic_make_request: Trying to access "
- "nonexistent block-device %s (%Lu)\n",
- bdevname(bio->bi_bdev, b),
- (long long) bio->bi_iter.bi_sector);
- goto end_io;
- }
+ if (!q) {
+ q = bdev_get_queue(bio->bi_bdev);
+ if (unlikely(!q)) {
+ printk(KERN_ERR
+ "generic_make_request: Trying to access "
+ "nonexistent block-device %s (%Lu)\n",
+ bdevname(bio->bi_bdev, b),
+ (long long) bio->bi_iter.bi_sector);
+ goto end_io;
+ }
+ } else if (q != bdev_get_queue(bio->bi_bdev)) {
+ printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
+ goto end_io;
+ }
- if (likely(bio_is_rw(bio) &&
- nr_sectors > queue_max_hw_sectors(q))) {
- printk(KERN_ERR "bio too big device %s (%u > %u)\n",
- bdevname(bio->bi_bdev, b),
- bio_sectors(bio),
- queue_max_hw_sectors(q));
- goto end_io;
- }
+ if (likely(bio_is_rw(bio) &&
+ nr_sectors > queue_max_hw_sectors(q))) {
+ printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+ bdevname(bio->bi_bdev, b),
+ bio_sectors(bio),
+ queue_max_hw_sectors(q));
+ goto end_io;
+ }
- part = bio->bi_bdev->bd_part;
- if (should_fail_request(part, bio->bi_iter.bi_size) ||
- should_fail_request(&part_to_disk(part)->part0,
- bio->bi_iter.bi_size))
- goto end_io;
+ part = bio->bi_bdev->bd_part;
+ if (should_fail_request(part, bio->bi_iter.bi_size) ||
+ should_fail_request(&part_to_disk(part)->part0,
+ bio->bi_iter.bi_size))
+ goto end_io;
- /*
- * If this device has partitions, remap block n
- * of partition p to block n+start(p) of the disk.
- */
- blk_partition_remap(bio);
+ /*
+ * If this device has partitions, remap block n
+ * of partition p to block n+start(p) of the disk.
+ */
+ blk_partition_remap(bio);
- if (bio_check_eod(bio, nr_sectors))
- goto end_io;
+ if (bio_check_eod(bio, nr_sectors))
+ goto end_io;
- /*
- * Filter flush bio's early so that make_request based
- * drivers without flush support don't have to worry
- * about them.
- */
- if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
- bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
- if (!nr_sectors) {
- err = 0;
+ /*
+ * Filter flush bio's early so that make_request based
+ * drivers without flush support don't have to worry
+ * about them.
+ */
+ if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+ bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+ if (!nr_sectors) {
+ /*
+ * we don't know how to mix empty flush bios
+ * with a list of non-flush bios on devices
+ * that don't support flushing
+ */
+ if (linked_bio)
+ err = -EINVAL;
+ else
+ err = 0;
+ goto end_io;
+ }
+ }
+
+ if ((bio->bi_rw & REQ_DISCARD) &&
+ (!blk_queue_discard(q) ||
+ ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+ err = -EOPNOTSUPP;
goto end_io;
}
- }
- if ((bio->bi_rw & REQ_DISCARD) &&
- (!blk_queue_discard(q) ||
- ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
+ if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
- if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
+ if ((bio->bi_rw & REQ_ATOMIC) &&
+ !q->limits.atomic_write_segments) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
Looks we don't check if the queue is a stacked queue for MD/DM. And I didn't
see we check if atomic bios number exceeds atomic_write_segments.
Post by Chris Mason
- /*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
- */
- create_io_context(GFP_ATOMIC, q->node);
+ /*
+ * Various block parts want %current->io_context and lazy ioc
+ * allocation ends up trading a lot of pain for a small amount of
+ * memory. Just allocate it upfront. This may fail and block
+ * layer knows how to live with it.
+ */
+ create_io_context(GFP_ATOMIC, q->node);
- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
+ if (blk_throtl_bio(q, bio))
+ return false; /* throttled, will be resubmitted later */
- trace_block_bio_queue(q, bio);
+ trace_block_bio_queue(q, bio);
+ }
return true;
- bio_endio(bio, err);
+ end_linked_bio(first_bio, err);
return false;
}
@@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
return;
/*
+ * generic_make_request checks for atomic write support, we'll have
+ * failed already if the queue doesn't support it
+ */
+ if (bio->bi_rw & REQ_ATOMIC) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+ q->make_request_fn(q, bio);
directly sending requests out has problem for MD/DM. But We should fail early
for MD/DM check, so this isn't a problem if we detected this is a stacked queue.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Jeff Moyer
2013-11-05 17:43:31 UTC
Permalink
Post by Chris Mason
This allows filesystems and O_DIRECT to send down a list of bios
flagged for atomic completion. If the hardware supports atomic
IO, it is given the whole list in a single make_request_fn
call.
In order to limit corner cases, there are a few restrictions in the
* Every bio in the list must be for the same queue
* Every bio must be a simple write. No trims or reads may be mixed in
A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.
Any number greater than one is allowed, but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.
Hi, Chris,

This is great stuff. I have a couple of high level questions that I'm
hoping you can answer, given that you're closer to the hardware than
most. What constraints can we expect hardware to impose on atomic
writes in terms of size and, um, contiguousness (is that a word)? How
do we communicate those constraints to the application? (I'm not
convinced a sysfs file is adequate.)

For example, looking at NVMe, it appears that devices may guarantee that
a set of /sequential/ logical blocks may be completed atomically, but I
don't see a provision for disjoint regions. That spec also
differentiates between power fail write atomicity and "normal" write
atomicity.

Basically, I'd like to avoid requiring a trial and error programming
model to determine what an application can expect to work (like we have
with O_DIRECT right now).

Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Chris Mason
2013-11-07 13:52:20 UTC
Permalink
Quoting Jeff Moyer (2013-11-05 12:43:31)
Post by Jeff Moyer
Post by Chris Mason
This allows filesystems and O_DIRECT to send down a list of bios
flagged for atomic completion. If the hardware supports atomic
IO, it is given the whole list in a single make_request_fn
call.
In order to limit corner cases, there are a few restrictions in the
* Every bio in the list must be for the same queue
* Every bio must be a simple write. No trims or reads may be mixed in
A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.
Any number greater than one is allowed, but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.
Hi, Chris,
This is great stuff. I have a couple of high level questions that I'm
hoping you can answer, given that you're closer to the hardware than
most. What constraints can we expect hardware to impose on atomic
writes in terms of size and, um, contiguousness (is that a word)? How
do we communicate those constraints to the application? (I'm not
convinced a sysfs file is adequate.)
For example, looking at NVMe, it appears that devices may guarantee that
a set of /sequential/ logical blocks may be completed atomically, but I
don't see a provision for disjoint regions. That spec also
differentiates between power fail write atomicity and "normal" write
atomicity.
Unfortunately, it's hard to say. I think the fusionio cards are the
only shipping devices that support this, but I've definitely heard that
others plan to support it as well. mariadb/percona already support the
atomics via fusionio specific ioctls, and turning that into a real
O_ATOMIC is a priority so other hardware can just hop on the train.

This feature in general is pretty natural for the log structured squirrels
they stuff inside flash, so I'd expect everyone to support it. Matthew,
how do you feel about all of this?

With the fusionio drivers, we've recently increased the max atomic size.
It's basically 1MB, disjoint or contig doesn't matter. We're powercut
safe at 1MB.
Post by Jeff Moyer
Basically, I'd like to avoid requiring a trial and error programming
model to determine what an application can expect to work (like we have
with O_DIRECT right now).
I'm really interested in ideas on how to provide that. But, with dm,
md, and a healthy assortment of flash vendors, I don't know how...

I've attached my current test program. The basic idea is to fill
buffers (1MB in size) with a random pattern. Each buffer has a
different random pattern.

You let it run for a while and then pull the plug. After the box comes
back up, run the program again and it looks for consistent patterns
filling each 1MB aligned region in the file.

Usage:

gcc -Wall -o atomic-pattern atomic-pattern.c

create a heavily fragmented file (exercise for the user, I need
to make a mode for this)

atomic-pattern file_name init

<wait for init done printf to appear>
<let it run for a while>
<cut power to the box>
<box comes back to life>

atomic-pattern file_name check


In order to reliably find torn blocks without O_ATOMIC, I had to bump
the write size to 1MB and run 24 instances in parallel.

/*
* Copyright 2013 Fusion-io
* GPLv2 or higher license
*/

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <errno.h>

#define FILE_SIZE (300 * 1024 * 1024)
#define O_DIRECT 00040000ULL
#define O_ATOMIC 040000000ULL

/*
 * Fill buf with repeated copies of the pattern word seq.  A torn
 * (non-atomic) write is later detected by check_block_headers()
 * finding two different pattern words inside one atomic unit.
 *
 * Fix: the original loop condition (buffer_size > sizeof(seq))
 * stopped one word early, so the final sizeof(seq) bytes of every
 * buffer were never written and carried stale memory to disk.  Using
 * >= patterns the whole buffer.  The (int) cast also avoids the
 * int-vs-size_t comparison, which would have looped wildly if a
 * negative buffer_size were ever passed in.
 */
void set_block_headers(unsigned char *buf, int buffer_size, unsigned long seq)
{
	while (buffer_size >= (int)sizeof(seq)) {
		memcpy(buf, &seq, sizeof(seq));
		buffer_size -= sizeof(seq);
		buf += sizeof(seq);
	}
}

/*
 * Verify that buf is filled with a single repeated pattern word.  The
 * expected value is taken from the first word of the buffer itself, so
 * any consistent pattern passes; a mismatch means a torn write.
 * Returns 0 on success, -EIO on the first mismatching word (after
 * logging both values to stderr).
 *
 * NOTE(review): the loop condition deliberately mirrors the writer in
 * set_block_headers(), so the trailing word(s) of the buffer are never
 * examined — presumably acceptable for this test; confirm if tightening.
 */
int check_block_headers(unsigned char *buf, int buffer_size)
{
	unsigned long expected = 0;
	unsigned long word = 0;

	/* first word defines the pattern; buf is intentionally not advanced,
	 * so the opening comparison is a trivially-true self check */
	memcpy(&expected, buf, sizeof(expected));
	buffer_size -= sizeof(expected);

	for (; (size_t)buffer_size > sizeof(expected);
	     buffer_size -= sizeof(expected), buf += sizeof(expected)) {
		memcpy(&word, buf, sizeof(word));
		if (word != expected) {
			fprintf(stderr, "check failed %lx %lx\n", expected, word);
			return -EIO;
		}
	}
	return 0;
}

/*
 * atomic-pattern: torn-write tester for O_DIRECT | O_ATOMIC.
 *
 * Modes:
 *   atomic-pattern file init   - fill the file with patterned 1MB blocks,
 *                                fsync, then run random rewrites forever
 *   atomic-pattern file check  - after a power cut, verify every 1MB
 *                                block holds one consistent pattern
 *   atomic-pattern file        - skip init, go straight to random rewrites
 *
 * Fixes vs. the original:
 *   - a failing pread in check mode reported perror("write"); now "read"
 *   - posix_memalign reports failure via its return value and does NOT
 *     set errno, so perror() printed garbage there; use fprintf instead
 */
int main(int ac, char **av)
{
	unsigned char *file_buf;
	loff_t pos;
	int ret;
	int fd;
	int write_size = 1024 * 1024;	/* one atomic unit per IO */
	char *filename = av[1];		/* av[1] is NULL-safe to read when ac < 2 */
	int check = 0;
	int init = 0;

	if (ac < 2) {
		fprintf(stderr, "usage: atomic-pattern filename [check | init]\n");
		exit(1);
	}
	if (ac > 2) {
		if (!strcmp(av[2], "check")) {
			check = 1;
			fprintf(stderr, "checking %s\n", filename);
		} else if (!strcmp(av[2], "init")) {
			init = 1;
			fprintf(stderr, "init %s\n", filename);
		} else {
			fprintf(stderr, "usage: atomic-pattern filename [check | init]\n");
			exit(1);
		}
	}

	/* O_DIRECT requires alignment; 4096 covers typical sector sizes */
	ret = posix_memalign((void **)&file_buf, 4096, write_size);
	if (ret) {
		fprintf(stderr, "cannot allocate memory\n");
		exit(1);
	}

	fd = open(filename, O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	/* flip the already-open fd into direct + atomic mode */
	ret = fcntl (fd, F_SETFL, O_DIRECT | O_ATOMIC);
	if (ret) {
		perror("fcntl");
		exit(1);
	}
	pos = 0;
	if (!init && !check)
		goto runit;

	/* sequential pass: init writes patterns, check verifies them */
	while (pos < FILE_SIZE) {
		if (check) {
			ret = pread(fd, file_buf, write_size, pos);
			if (ret != write_size) {
				perror("read");
				exit(1);
			}
			ret = check_block_headers(file_buf, write_size);
			if (ret) {
				fprintf(stderr, "Failed check on buffer %llu\n", (unsigned long long)pos);
				exit(1);
			}
		} else {
			set_block_headers(file_buf, write_size, rand());

			ret = pwrite(fd, file_buf, write_size, pos);
			if (ret != write_size) {
				perror("write");
				exit(1);
			}
		}
		pos += write_size;
	}
	if (check)
		exit(0);

	fsync(fd);
runit:
	fprintf(stderr, "File init done, running random writes\n");

	/* rewrite random write_size-aligned blocks until the power cut */
	while (1) {
		pos = rand() % FILE_SIZE;

		pos = pos / write_size;
		pos = pos * write_size;

		if (pos + write_size > FILE_SIZE)
			pos = 0;

		set_block_headers(file_buf, write_size, rand());

		ret = pwrite(fd, file_buf, write_size, pos);
		if (ret != write_size) {
			perror("write");
			exit(1);
		}
	}
	return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Jeff Moyer
2013-11-07 15:43:41 UTC
Permalink
Post by Chris Mason
Unfortunately, it's hard to say. I think the fusionio cards are the
only shipping devices that support this, but I've definitely heard that
others plan to support it as well. mariadb/percona already support the
atomics via fusionio specific ioctls, and turning that into a real
O_ATOMIC is a priority so other hardware can just hop on the train.
This feature in general is pretty natural for the log structured squirrels
they stuff inside flash, so I'd expect everyone to support it. Matthew,
how do you feel about all of this?
With the fusionio drivers, we've recently increased the max atomic size.
It's basically 1MB, disjoint or contig doesn't matter. We're powercut
safe at 1MB.
Post by Jeff Moyer
Basically, I'd like to avoid requiring a trial and error programming
model to determine what an application can expect to work (like we have
with O_DIRECT right now).
I'm really interested in ideas on how to provide that. But, with dm,
md, and a healthy assortment of flash vendors, I don't know how...
Well, we have control over dm and md, so I'm not worried about that.
For the storage vendors, we'll have to see about influencing the
standards bodies.

The way I see it, there are 3 pieces of information that are required:
1) minimum size that is atomic (likely the physical block size, but
maybe the logical block size?)
2) maximum size that is atomic (multiple of minimum size)
3) whether or not discontiguous ranges are supported

Did I miss anything?
Post by Chris Mason
I've attached my current test program. The basic idea is to fill
buffers (1MB in size) with a random pattern. Each buffer has a
different random pattern.
You let it run for a while and then pull the plug. After the box comes
back up, run the program again and it looks for consistent patterns
filling each 1MB aligned region in the file.
[snip]
Post by Chris Mason
In order to reliably find torn blocks without O_ATOMIC, I had to bump
the write size to 1MB and run 24 instances in parallel.
Thanks for the program (I actually have my own setup for verifying torn
writes, the veritable dainto[1], which nobody uses). Just to be certain,
you did bump /sys/block/<dev>/queue/max_sectors_kb to 1MB, right?

Cheers,
Jeff

[1] http://people.redhat.com/jmoyer/dainto-0.99.4.tar.bz2
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Chris Mason
2013-11-07 15:55:54 UTC
Permalink
Quoting Jeff Moyer (2013-11-07 10:43:41)
Post by Jeff Moyer
Post by Chris Mason
Unfortunately, it's hard to say. I think the fusionio cards are the
only shipping devices that support this, but I've definitely heard that
others plan to support it as well. mariadb/percona already support the
atomics via fusionio specific ioctls, and turning that into a real
O_ATOMIC is a priority so other hardware can just hop on the train.
This feature in general is pretty natural for the log structured squirrels
they stuff inside flash, so I'd expect everyone to support it. Matthew,
how do you feel about all of this?
With the fusionio drivers, we've recently increased the max atomic size.
It's basically 1MB, disjoint or contig doesn't matter. We're powercut
safe at 1MB.
Post by Jeff Moyer
Basically, I'd like to avoid requiring a trial and error programming
model to determine what an application can expect to work (like we have
with O_DIRECT right now).
I'm really interested in ideas on how to provide that. But, with dm,
md, and a healthy assortment of flash vendors, I don't know how...
Well, we have control over dm and md, so I'm not worried about that.
For the storage vendors, we'll have to see about influencing the
standards bodies.
1) minimum size that is atomic (likely the physical block size, but
maybe the logical block size?)
2) maximum size that is atomic (multiple of minimum size)
3) whether or not discontiguous ranges are supported
Did I miss anything?
It'll vary from vendor to vendor. A discontig range of two 512KB areas
is different from 256 discontig 4KB areas.

And it's completely dependent on filesystem fragmentation. So, a given
IO might pass for one file and fail for the next.

In a DM/MD configuration, an atomic IO inside a single stripe on raid0
could succeed while it will fail if it spans two stripes to two
different devices.
Post by Jeff Moyer
Post by Chris Mason
I've attached my current test program. The basic idea is to fill
buffers (1MB in size) with a random pattern. Each buffer has a
different random pattern.
You let it run for a while and then pull the plug. After the box comes
back up, run the program again and it looks for consistent patterns
filling each 1MB aligned region in the file.
[snip]
Post by Chris Mason
In order to reliably find torn blocks without O_ATOMIC, I had to bump
the write size to 1MB and run 24 instances in parallel.
Thanks for the program (I actually have my own setup for verifying torn
writes, the veritable dainto[1], which nobody uses). Just to be certain,
you did bump /sys/block/<dev>/queue/max_sectors_kb to 1MB, right?
Since the atomics patch does things as a list of bios, there's no
max_sectors_kb to worry about. Each individual bio was only 4K.

-chris

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Jeff Moyer
2013-11-07 16:14:02 UTC
Permalink
Post by Chris Mason
Post by Jeff Moyer
Well, we have control over dm and md, so I'm not worried about that.
For the storage vendors, we'll have to see about influencing the
standards bodies.
1) minimum size that is atomic (likely the physical block size, but
maybe the logical block size?)
2) maximum size that is atomic (multiple of minimum size)
3) whether or not discontiguous ranges are supported
Did I miss anything?
It'll vary from vendor to vendor. A discontig range of two 512KB areas
is different from 256 distcontig 4KB areas.
Sure.
Post by Chris Mason
And it's completely dependent on filesystem fragmentation. So, a given
IO might pass for one file and fail for the next.
Worse, it could pass for one region of a file and fail for a different
region of the same file.

I guess you could export the most conservative estimate, based on
completely non-contiguous smallest sized segments. Things larger may
work, but they may not. Perhaps this would be too limiting, I don't
know.
Post by Chris Mason
In a DM/MD configuration, an atomic IO inside a single stripe on raid0
could succeed while it will fail if it spans two stripes to two
different devices.
I'd say that if you are spanning multiple devices, you don't support
O_ATOMIC. You could write a specific dm target that allows it, but I
don't think it's a priority to support it in the way your example does.

Given that there are applications using your implementation, what did
they determine was a sane way to do things? Only access the block
device? Preallocate files? Fallback to non-atomic writes + fsync?
Something else?

Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Chris Mason
2013-11-07 16:52:49 UTC
Permalink
Quoting Jeff Moyer (2013-11-07 11:14:02)
Post by Jeff Moyer
Post by Chris Mason
Post by Jeff Moyer
Well, we have control over dm and md, so I'm not worried about that.
For the storage vendors, we'll have to see about influencing the
standards bodies.
1) minimum size that is atomic (likely the physical block size, but
maybe the logical block size?)
2) maximum size that is atomic (multiple of minimum size)
3) whether or not discontiguous ranges are supported
Did I miss anything?
It'll vary from vendor to vendor. A discontig range of two 512KB areas
is different from 256 distcontig 4KB areas.
Sure.
Post by Chris Mason
And it's completely dependent on filesystem fragmentation. So, a given
IO might pass for one file and fail for the next.
Worse, it could pass for one region of a file and fail for a different
region of the same file.
I guess you could export the most conservative estimate, based on
completely non-contiguous smallest sized segments. Things larger may
work, but they may not. Perhaps this would be too limiting, I don't
know.
Depends on the workload. For mysql, they really only need ~16KB in the
default config. I'd rather not restrict things to that one case, but
it's pretty easy to satisfy.
Post by Jeff Moyer
Post by Chris Mason
In a DM/MD configuration, an atomic IO inside a single stripe on raid0
could succeed while it will fail if it spans two stripes to two
different devices.
I'd say that if you are spanning multiple devices, you don't support
O_ATOMIC. You could write a specific dm target that allows it, but I
don't think it's a priority to support it in the way your example does.
Given that there are applications using your implementation, what did
they determine was a sane way to do things? Only access the block
device? Preallocate files? Fallback to non-atomic writes + fsync?
Something else?
Admin opt-in on single drives only. mysql exits with errors if the
atomics aren't supported.

-chris

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Dave Chinner
2013-11-13 23:59:14 UTC
Permalink
Post by Jeff Moyer
Post by Chris Mason
Post by Jeff Moyer
Well, we have control over dm and md, so I'm not worried about that.
For the storage vendors, we'll have to see about influencing the
standards bodies.
1) minimum size that is atomic (likely the physical block size, but
maybe the logical block size?)
2) maximum size that is atomic (multiple of minimum size)
3) whether or not discontiguous ranges are supported
Did I miss anything?
It'll vary from vendor to vendor. A discontig range of two 512KB areas
is different from 256 distcontig 4KB areas.
Sure.
Post by Chris Mason
And it's completely dependent on filesystem fragmentation. So, a given
IO might pass for one file and fail for the next.
Worse, it could pass for one region of a file and fail for a different
region of the same file.
I guess you could export the most conservative estimate, based on
completely non-contiguous smallest sized segments. Things larger may
work, but they may not. Perhaps this would be too limiting, I don't
know.
Post by Chris Mason
In a DM/MD configuration, an atomic IO inside a single stripe on raid0
could succeed while it will fail if it spans two stripes to two
different devices.
I'd say that if you are spanning multiple devices, you don't support
O_ATOMIC. You could write a specific dm target that allows it, but I
don't think it's a priority to support it in the way your example does.
I would have thought this would be pretty simple to do - just
journal the atomic write so it can be recovered in full if there is
a power failure.

Indeed, what I'd really like to be able to do from a filesystem
perspective is to be able to issue a group of related metadata IO as
an atomic write rather than marshaling it through a journal and then
issuing them as unrelated IO. If we have a special dm-target
underneath that can either issue it as an atomic write (if the
hardware supports it) or emulate it via a journal to maintain
multi-device atomicity requirements then we end up with a general
atomic write solution that filesystems can then depend on.

Once we have guaranteed support for atomic writes, then we can
completely remove journalling from filesystem transaction engines
as the atomicity requirements can be met with atomic writes. An then
we can optimise things like fsync() for atomic writes.

IOWs, generic support for atomic writes will make a major difference
to filesystem algorithms. Hence, from my perspective, at this early
point in the development lifecycle having guaranteed atomic write
support via emulation is far more important than actually having
hardware that supports it... :)
Post by Jeff Moyer
Given that there are applications using your implementation, what did
they determine was a sane way to do things? Only access the block
device? Preallocate files? Fallback to non-atomic writes + fsync?
Something else?
Avoiding these problems is, IMO, another good reason for having
generic, transparent support for atomic writes built into the IO
stack....

Cheers,

Dave.
--
Dave Chinner
***@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Matthew Wilcox
2013-11-12 15:11:51 UTC
Permalink
Post by Chris Mason
Unfortunately, it's hard to say. I think the fusionio cards are the
only shipping devices that support this, but I've definitely heard that
others plan to support it as well. mariadb/percona already support the
atomics via fusionio specific ioctls, and turning that into a real
O_ATOMIC is a priority so other hardware can just hop on the train.
This feature in general is pretty natural for the log structured squirrels
they stuff inside flash, so I'd expect everyone to support it. Matthew,
how do you feel about all of this?
NVMe doesn't have support for this functionality. I know what stories I've
heard from our internal device teams about what they can and can't support
in the way of this kind of thing, but I obviously can't repeat them here!

I took a look at the SCSI Block Command spec. If I understand it
correctly, SCSI would implement this with the WRITE USING TOKEN command.
I don't see why it couldn't implement this API, though it seems like
SCSI would prefer a separate setup step before the write comes in. I'm
not sure that's a reasonable request to make of the application (nor
am I sure I understand SBC correctly).

I like the API, but I'm a little confused not to see a patch saying "Oh,
and here's how we implemented it in btrfs without any hardware support"
;-) It seems to me that the concept is just as good a match for an
advanced filesystem that supports snapshots as it is for the FTL inside
a drive.
Post by Chris Mason
With the fusionio drivers, we've recently increased the max atomic size.
It's basically 1MB, disjoint or contig doesn't matter. We're powercut
safe at 1MB.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Chris Mason
2013-11-13 20:44:38 UTC
Permalink
Quoting Matthew Wilcox (2013-11-12 10:11:51)
Post by Matthew Wilcox
Post by Chris Mason
Unfortunately, it's hard to say. I think the fusionio cards are the
only shipping devices that support this, but I've definitely heard that
others plan to support it as well. mariadb/percona already support the
atomics via fusionio specific ioctls, and turning that into a real
O_ATOMIC is a priority so other hardware can just hop on the train.
This feature in general is pretty natural for the log structured squirrels
they stuff inside flash, so I'd expect everyone to support it. Matthew,
how do you feel about all of this?
NVMe doesn't have support for this functionality. I know what stories I've
heard from our internal device teams about what they can and can't support
in the way of this kind of thing, but I obviously can't repeat them here!
There are some atomics in the NVMe spec though, correct? The minimum
needed for database use is only ~16-64K.
Post by Matthew Wilcox
I took a look at the SCSI Block Command spec. If I understand it
correctly, SCSI would implement this with the WRITE USING TOKEN command.
I don't see why it couldn't implement this API, though it seems like
SCSI would prefer a separate setup step before the write comes in. I'm
not sure that's a reasonable request to make of the application (nor
am I sure I understand SBC correctly).
What kind of setup would we have to do? We have all the IO in hand, so
it can be organized in just about any way needed.
Post by Matthew Wilcox
I like the API, but I'm a little confused not to see a patch saying "Oh,
and here's how we implemented it in btrfs without any hardware support"
;-) It seems to me that the concept is just as good a match for an
advanced filesystem that supports snapshots as it is for the FTL inside
a drive.
Grin, almost Btrfs already does this...COW means that btrfs needs to
update metadata to point to new locations. To avoid an ugly
flush-all-the-io-every-commit mess, we track pending writes and update
the metadata when the write is fully on media.

We're missing a firm line that makes sure all the metadata updates for a
single write happen in the same transaction, but that part isn't hard.

We're missing good performance in database workloads, which is a
slightly bigger trick.

-chris

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Howard Chu
2013-11-13 20:53:48 UTC
Permalink
Post by Chris Mason
Quoting Matthew Wilcox (2013-11-12 10:11:51)
Post by Matthew Wilcox
I took a look at the SCSI Block Command spec. If I understand it
correctly, SCSI would implement this with the WRITE USING TOKEN command.
I don't see why it couldn't implement this API, though it seems like
SCSI would prefer a separate setup step before the write comes in. I'm
not sure that's a reasonable request to make of the application (nor
am I sure I understand SBC correctly).
What kind of setup would we have to do? We have all the IO in hand, so
it can be organized in just about any way needed.
Post by Matthew Wilcox
I like the API, but I'm a little confused not to see a patch saying "Oh,
and here's how we implemented it in btrfs without any hardware support"
;-) It seems to me that the concept is just as good a match for an
advanced filesystem that supports snapshots as it is for the FTL inside
a drive.
Grin, almost Btrfs already does this...COW means that btrfs needs to
update metadata to point to new locations. To avoid an ugly
flush-all-the-io-every-commit mess, we track pending writes and update
the meatadata when the write is fully on media.
We're missing a firm line that makes sure all the metadata updates for a
single write happen in the same transaction, but that part isn't hard.
We're missing good performance in database workloads, which is a
slightly bigger trick.
This is precisely why this is needed:
http://www.spinics.net/lists/linux-fsdevel/msg70047.html
--
-- Howard Chu
CTO, Symas Corp. http://www.symas.com
Director, Highland Sun http://highlandsun.com/hyc/
Chief Architect, OpenLDAP http://www.openldap.org/project/
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Matthew Wilcox
2013-11-13 21:35:54 UTC
Permalink
Post by Chris Mason
Quoting Matthew Wilcox (2013-11-12 10:11:51)
Unfortunately, it's hard to say. I think the fusionio cards are =
the
Post by Chris Mason
only shipping devices that support this, but I've definitely hear=
d that
Post by Chris Mason
others plan to support it as well. mariadb/percona already suppo=
rt the
Post by Chris Mason
atomics via fusionio specific ioctls, and turning that into a rea=
l
Post by Chris Mason
O_ATOMIC is a priority so other hardware can just hop on the trai=
n.
Post by Chris Mason
=20
This feature in general is pretty natural for the log structured =
squirrels
Post by Chris Mason
they stuff inside flash, so I'd expect everyone to support it. M=
atthew,
Post by Chris Mason
how do you feel about all of this?
=20
NVMe doesn't have support for this functionality. I know what stor=
ies I've
Post by Chris Mason
heard from our internal device teams about what they can and can't =
support
Post by Chris Mason
in the way of this kind of thing, but I obviously can't repeat them=
here!
Post by Chris Mason
=20
There are some atomics in the NVMe spec though, correct? The minimum
needed for database use is only ~16-64K.
Yes, NVMe has limited atomic support. It has two fields:

Atomic Write Unit Normal (AWUN): This field indicates the atomic writ=
e
size for the controller during normal operation. This field is specif=
ied
in logical blocks and is a 0=E2=80=99s based value. If a write is sub=
mitted
of this size or less, the host is guaranteed that the write is atomic
to the NVM with respect to other read or write operations. If a write
is submitted that is greater than this size, there is no guarantee
of atomicity.

A value of FFFFh indicates all commands are atomic as this is the
largest command size. It is recommended that implementations support
a minimum of 128KB (appropriately scaled based on LBA size).


Atomic Write Unit Power Fail (AWUPF): This field indicates the atomic
write size for the controller during a power fail condition. This
field is specified in logical blocks and is a 0=E2=80=99s based value=
=2E If a
write is submitted of this size or less, the host is guaranteed that
the write is atomic to the NVM with respect to other read or write
operations. If a write is submitted that is greater than this size,
there is no guarantee of atomicity.


Basically just exposing what is assumed to be true for SCSI and general=
ly
assumed to be lies for ATA drives :-)
Post by Chris Mason
I took a look at the SCSI Block Command spec. If I understand it
correctly, SCSI would implement this with the WRITE USING TOKEN com=
mand.
Post by Chris Mason
I don't see why it couldn't implement this API, though it seems lik=
e
Post by Chris Mason
SCSI would prefer a separate setup step before the write comes in. =
I'm
Post by Chris Mason
not sure that's a reasonable request to make of the application (no=
r
Post by Chris Mason
am I sure I understand SBC correctly).
=20
What kind of setup would we have to do? We have all the IO in hand, =
so
Post by Chris Mason
it can be organized in just about any way needed.
Someone who understands SCSI better than I do assures me this is NOT th=
e
proposal that allows SCSI devices to do scattered writes. Apparently t=
hat
proposal is still in progress. This appears to be true; from the t10
NEW list:

12-087r6 SBC-4 Gathered reads, optionally atomic Rob Elliott, Ashish =
Batwara, Walt Hubis Missing=09
12-086r6 SBC-4 SPC-5 Scattered writes, optionally atomic Rob Elliott,=
Ashish Batwara, Walt Hubis Missing
Post by Chris Mason
Grin, almost Btrfs already does this...COW means that btrfs needs to
update metadata to point to new locations. To avoid an ugly
flush-all-the-io-every-commit mess, we track pending writes and updat=
e
Post by Chris Mason
the meatadata when the write is fully on media.
=20
We're missing a firm line that makes sure all the metadata updates fo=
r a
Post by Chris Mason
single write happen in the same transaction, but that part isn't hard=
=2E
Post by Chris Mason
=20
We're missing good performance in database workloads, which is a
slightly bigger trick.
Yeah ... if only you could find a database company to ... oh, wait :-)

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel=
" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Chris Mason
2013-11-01 21:29:44 UTC
Permalink
This adds the O_ATOMIC file flag (which requires O_DIRECT). If
applications request atomic IO, the generic O_DIRECT code is changed to
build a list of bios to represent any single O_DIRECT write() call. The
bios may span discontig areas of the drive if the file is fragmented.

The bios are sent to submit_bio as a single unit, and we expect the
storage to do one of three things:

Fail each bio individually if the list is too large for atomic
completion.

Fail each bio individually if there are any errors during any write.

Complete each bio with success if every write is fully stable
on media.

This works with any filesystem that uses the generic O_DIRECT code for
bio submission (almost everyone except Btrfs).

Signed-off-by: Chris Mason <***@fusionio.com>
---
fs/direct-io.c | 23 +++++++++++++++++++++--
fs/fcntl.c | 14 +++++++++++---
include/uapi/asm-generic/fcntl.h | 4 ++++
3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a548..6837418 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -120,6 +120,7 @@ struct dio {
struct inode *inode;
loff_t i_size; /* i_size when submitted */
dio_iodone_t *end_io; /* IO completion function */
+ struct bio_list atomic_bio;

void *private; /* copy from map_bh.b_private */

@@ -409,14 +410,30 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
if (sdio->submit_io)
sdio->submit_io(dio->rw, bio, dio->inode,
sdio->logical_offset_in_bio);
- else
- submit_bio(dio->rw, bio);
+ else {
+ /* atomic writes are collected for submission together */
+ if (dio->rw != READ &&
+ (dio->iocb->ki_filp->f_flags & O_ATOMIC)) {
+ bio->bi_rw |= (REQ_ATOMIC | dio->rw);
+ bio_list_add(&dio->atomic_bio, bio);
+ } else {
+ /* everything else is sent directly */
+ submit_bio(dio->rw, bio);
+ }
+ }

sdio->bio = NULL;
sdio->boundary = 0;
sdio->logical_offset_in_bio = 0;
}

+static inline void dio_bio_atomic_submit(struct dio *dio)
+{
+ struct bio *bio = bio_list_get(&dio->atomic_bio);
+ if (bio)
+ submit_bio(dio->rw, bio);
+}
+
/*
* Release any resources in case of a failure
*/
@@ -1173,6 +1190,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* care to only zero out what's needed.
*/
memset(dio, 0, offsetof(struct dio, pages));
+ bio_list_init(&dio->atomic_bio);

dio->flags = flags;
if (dio->flags & DIO_LOCKING) {
@@ -1318,6 +1336,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (sdio.bio)
dio_bio_submit(dio, &sdio);

+ dio_bio_atomic_submit(dio);
blk_finish_plug(&plug);

/*
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 65343c3..09f4c7a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -26,7 +26,8 @@
#include <asm/siginfo.h>
#include <asm/uaccess.h>

-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | \
+ O_DIRECT | O_NOATIME | O_ATOMIC)

static int setfl(int fd, struct file * filp, unsigned long arg)
{
@@ -56,6 +57,12 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
return -EINVAL;
}

+ /* O_ATOMIC requires O_DIRECT */
+ if (arg & O_ATOMIC) {
+ if (!((arg | filp->f_flags) & O_DIRECT))
+ return -EINVAL;
+ }
+
if (filp->f_op && filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
if (error)
@@ -730,14 +737,15 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
__O_SYNC | O_DSYNC | FASYNC |
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH | __O_TMPFILE
+ __FMODE_EXEC | O_PATH | __O_TMPFILE |
+ O_ATOMIC
));

fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 95e46c8..00259df 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -88,6 +88,10 @@
#define __O_TMPFILE 020000000
#endif

+#ifndef O_ATOMIC
+#define O_ATOMIC 040000000 /* set do atomic O_DIRECT writes */
+#endif
+
/* a horrid kludge trying to make sure that this will fail on old kernels */
#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)
--
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Continue reading on narkive:
Loading...