Discussion:
[RFC PATCH 1/2] fs: Add dirreadahead syscall and VFS hooks
Abhi Das
2014-07-25 17:37:30 UTC
Also adds a void *opaque field to struct dir_context that can be
used by filesystems to temporarily store any context as this
struct gets passed around in the fs.

Signed-off-by: Abhi Das <***@redhat.com>
---
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
fs/readdir.c | 49 ++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +++
4 files changed, 54 insertions(+)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b8679..3e0ef85 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
+354 i386 dirreadahead sys_dirreadahead
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1..2ec0991 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common dirreadahead sys_dirreadahead

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/readdir.c b/fs/readdir.c
index 33fd922..d216db7 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -198,6 +198,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user * lastdirent;
struct getdents_callback buf = {
.ctx.actor = filldir,
+ .ctx.opaque = NULL,
.count = count,
.current_dir = dirent
};
@@ -278,6 +279,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
struct linux_dirent64 __user * lastdirent;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
+ .ctx.opaque = NULL,
.count = count,
.current_dir = dirent
};
@@ -304,3 +306,50 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
fdput(f);
return error;
}
+
+SYSCALL_DEFINE3(dirreadahead, unsigned int, fd,
+ loff_t __user *, offset, unsigned int, count)
+{
+ struct fd f;
+ struct inode *inode;
+ int error = -ENOTDIR;
+ loff_t off = 0;
+ struct dir_context ctx = {.actor = NULL, .opaque = NULL};
+
+ if (!count)
+ return -EINVAL;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ inode = f.file->f_path.dentry->d_inode;
+
+ error = -ENOTSUPP;
+ if (!f.file->f_op || !f.file->f_op->dir_readahead)
+ goto out;
+
+ error = security_file_permission(f.file, MAY_READ);
+ if (error)
+ goto out;
+
+ error = -EFAULT;
+ if (__get_user(ctx.pos, offset))
+ goto out;
+
+ error = mutex_lock_killable(&inode->i_mutex);
+ if (error)
+ goto out;
+
+ error = -ENOENT;
+ if (!IS_DEADDIR(inode)) {
+ error = f.file->f_op->dir_readahead(f.file, &ctx, count);
+ if (__put_user(ctx.pos, offset))
+ error = -EFAULT;
+ file_accessed(f.file);
+ }
+ mutex_unlock(&inode->i_mutex);
+out:
+ fdput(f);
+ return error;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 338e6f7..fae4a6e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1438,9 +1438,11 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
* to have different dirent layouts depending on the binary type.
*/
typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
struct dir_context {
const filldir_t actor;
loff_t pos;
+ void *opaque;
};

struct block_device_operations;
@@ -1463,6 +1465,7 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
+ int (*dir_readahead) (struct file *, struct dir_context *, unsigned int);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
--
1.8.1.4
Abhi Das
2014-07-25 17:37:31 UTC
This patch adds support to GFS2 for the dirreadahead system call.

There's a hard limit (128K) on the number of inodes that can be
read ahead at one time. There's also a memory cap on the number of
inode numbers we collect. We read ahead however many inodes we
have collected when the first of these two caps is hit.

Readahead is done in two stages. In the initial stage, the
directory is read through and the inode numbers of all its entries
are collected. In the second stage, readaheads are performed
asynchronously using workqueues, so the syscall can return to the
user at that point.
Subsequent getdents calls on the directory and stat calls on the
inodes will have the time-consuming lookups already done for them
and will therefore be quick.

Signed-off-by: Abhi Das <***@redhat.com>
---
fs/gfs2/Makefile | 3 +-
fs/gfs2/dir.c | 49 +++++++++---
fs/gfs2/dir.h | 15 ++++
fs/gfs2/dir_readahead.c | 209 ++++++++++++++++++++++++++++++++++++++++++++++++
fs/gfs2/file.c | 2 +
fs/gfs2/main.c | 10 ++-
fs/gfs2/super.c | 1 +
7 files changed, 274 insertions(+), 15 deletions(-)
create mode 100644 fs/gfs2/dir_readahead.c

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 8612820..2765c83 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -4,7 +4,8 @@ gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
glops.o log.o lops.o main.o meta_io.o \
aops.o dentry.o export.o file.o \
ops_fstype.o inode.o quota.o \
- recovery.o rgrp.o super.o sys.o trans.o util.o
+ recovery.o rgrp.o super.o sys.o trans.o \
+ dir_readahead.o util.o

gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9..f068763 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1217,6 +1217,20 @@ static int compare_dents(const void *a, const void *b)
return ret;
}

+static int gfs2_dirent_dot_or_dotdot(const struct gfs2_dirent *dent)
+{
+ const char *name = (char *)(dent + 1);
+
+ if (be16_to_cpu(dent->de_type) == DT_DIR) {
+ if (be16_to_cpu(dent->de_name_len) == 1 && name[0] == '.')
+ return 1;
+ if (be16_to_cpu(dent->de_name_len) == 2 &&
+ strncmp(name, "..", 2) == 0)
+ return 1;
+ }
+ return 0;
+}
+
/**
* do_filldir_main - read out directory entries
* @dip: The GFS2 inode
@@ -1262,8 +1276,12 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
ctx->pos = off;

if (off_next == off) {
- if (*copied && !run)
+ if (*copied && !run) {
+ struct gfs2_dir_ra *d_ra = ctx->opaque;
+ if (d_ra)
+ set_bit(RA_FL_HASHCOLL, &d_ra->flags);
return 1;
+ }
run = 1;
} else
run = 0;
@@ -1273,11 +1291,18 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
ctx->pos = off;
}

- if (!dir_emit(ctx, (const char *)(dent + 1),
- be16_to_cpu(dent->de_name_len),
- be64_to_cpu(dent->de_inum.no_addr),
- be16_to_cpu(dent->de_type)))
- return 1;
+ if (ctx->actor) {
+ if (!dir_emit(ctx, (const char *)(dent + 1),
+ be16_to_cpu(dent->de_name_len),
+ be64_to_cpu(dent->de_inum.no_addr),
+ be16_to_cpu(dent->de_type)))
+ return 1;
+ } else { /* we were called by dir_readahead */
+ if (gfs2_dirent_dot_or_dotdot(dent))
+ continue;
+ if (collect_inode_blocks(ctx, be64_to_cpu(dent->de_inum.no_addr)))
+ return 1;
+ }

*copied = 1;
}
@@ -1311,8 +1336,7 @@ static void gfs2_free_sort_buffer(void *ptr)
}

static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
- int *copied, unsigned *depth,
- u64 leaf_no)
+ int *copied, unsigned *depth, u64 leaf_no)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1399,14 +1423,14 @@ out:
}

/**
- * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ * gfs2_dir_leaf_ra - Issue read-ahead requests for leaf blocks.
*
* Note: we can't calculate each index like dir_e_read can because we don't
* have the leaf, and therefore we don't have the depth, and therefore we
* don't have the length. So we have to just read enough ahead to make up
* for the loss of information.
*/
-static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+static void gfs2_dir_leaf_ra(struct inode *inode, unsigned hsize, u32 index,
struct file_ra_state *f_ra)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1474,11 +1498,10 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx,
if (IS_ERR(lp))
return PTR_ERR(lp);

- gfs2_dir_readahead(inode, hsize, index, f_ra);
+ gfs2_dir_leaf_ra(inode, hsize, index, f_ra);

while (index < hsize) {
- error = gfs2_dir_read_leaf(inode, ctx,
- &copied, &depth,
+ error = gfs2_dir_read_leaf(inode, ctx, &copied, &depth,
be64_to_cpu(lp[index]));
if (error)
break;
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 126c65d..429eea9 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,6 +25,21 @@ struct gfs2_diradd {
struct buffer_head *bh;
};

+extern struct workqueue_struct *gfs2_dir_ra_wq;
+#define RA_MAX_INOS 131072 /*128K */
+#define RA_FL_HASHCOLL 1
+
+struct gfs2_dir_ra {
+ u64 *inos;
+ size_t size;
+ size_t count;
+ unsigned int req;
+ unsigned long flags;
+};
+
+extern int gfs2_dir_readahead(struct file *file, struct dir_context *ctx,
+ unsigned int count);
+extern int collect_inode_blocks(struct dir_context *ctx, u64 offset);
extern struct inode *gfs2_dir_search(struct inode *dir,
const struct qstr *filename,
bool fail_on_exist);
diff --git a/fs/gfs2/dir_readahead.c b/fs/gfs2/dir_readahead.c
new file mode 100644
index 0000000..98888ad
--- /dev/null
+++ b/fs/gfs2/dir_readahead.c
@@ -0,0 +1,209 @@
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
+#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+struct workqueue_struct *gfs2_dir_ra_wq;
+
+static int compare_inos(const void *a, const void *b)
+{
+ u64 ino_a, ino_b;
+
+ ino_a = *(u64 *)a;
+ ino_b = *(u64 *)b;
+
+ if (ino_a > ino_b)
+ return 1;
+ return -1;
+}
+
+static int collect_more(struct gfs2_dir_ra *d_ra)
+{
+ return (d_ra->count < d_ra->req &&
+ (d_ra->count * sizeof(u64)) < d_ra->size);
+}
+
+int collect_inode_blocks(struct dir_context *ctx, u64 ino)
+{
+ struct gfs2_dir_ra *d_ra = (struct gfs2_dir_ra *) ctx->opaque;
+
+ if (!collect_more(d_ra))
+ return 1; /* Collected requested blocks */
+
+ d_ra->inos[d_ra->count++] = ino;
+ return 0;
+}
+
+struct dir_ra_work {
+ struct work_struct work;
+ u64 ino;
+ struct gfs2_sbd *sdp;
+};
+
+static void dir_ra_work_func(struct work_struct *work)
+{
+ struct dir_ra_work *w = container_of(work, struct dir_ra_work, work);
+
+ /* XXX: What to do if sdp disappears by the time we get here? */
+ struct inode *inode = gfs2_lookup_by_inum(w->sdp, w->ino, NULL,
+ GFS2_BLKST_DINODE);
+ if (IS_ERR(inode)) {
+ fs_err(w->sdp, "can't read in inode at addr:%llu: %ld\n",
+ w->ino, PTR_ERR(inode));
+ } else {
+ gfs2_inode_refresh(GFS2_I(inode));
+ iput(inode);
+ }
+ kfree(work);
+}
+
+int gfs2_queue_dir_ra(struct dir_context *ctx, struct gfs2_sbd *sdp)
+{
+ int i;
+ struct gfs2_dir_ra *d_ra = (struct gfs2_dir_ra *) ctx->opaque;
+
+ sort(d_ra->inos, d_ra->count, sizeof(u64), compare_inos, NULL);
+
+ for (i=0; i<d_ra->count; i++) {
+ struct dir_ra_work *w;
+
+ w = kmalloc(sizeof(struct dir_ra_work), GFP_NOFS | __GFP_NOWARN);
+ if (!w)
+ break;
+
+ w->ino = d_ra->inos[i];
+ w->sdp = sdp;
+ INIT_WORK(&w->work, dir_ra_work_func);
+ queue_work(gfs2_dir_ra_wq, &w->work);
+ }
+ if (!i)
+ return -ENOMEM;
+ if (i != d_ra->count)
+ ctx->pos = 0; /* Don't know the resume offset for a short RA */
+ return i;
+}
+
+static inline unsigned int compute_ra_bufsize(unsigned int count)
+{
+ unsigned int size = count * (sizeof(u64));
+
+ if (size > KMALLOC_MAX_SIZE)
+ return KMALLOC_MAX_SIZE;
+ if (size < KMALLOC_MIN_SIZE)
+ return KMALLOC_MIN_SIZE;
+
+ return size;
+}
+
+static int init_ra_context(struct gfs2_inode *ip, struct dir_context *ctx,
+ unsigned int count)
+{
+ unsigned int bufsize;
+ struct gfs2_dir_ra *d_ra = (struct gfs2_dir_ra *) ctx->opaque;
+
+ memset(d_ra, 0, sizeof(struct gfs2_dir_ra));
+ count = count > RA_MAX_INOS ? RA_MAX_INOS : count;
+ count = count > ip->i_entries ? ip->i_entries : count;
+
+ bufsize = compute_ra_bufsize(count);
+ d_ra->inos = kmalloc(bufsize, GFP_NOFS | __GFP_NOWARN);
+ if (!d_ra->inos)
+ return -ENOMEM;
+
+ d_ra->size = bufsize;
+ d_ra->req = count;
+
+ return 0;
+}
+
+static void uninit_ra_context(struct dir_context *ctx)
+{
+ struct gfs2_dir_ra *d_ra;
+
+ if (!ctx || !ctx->opaque)
+ return;
+ d_ra = (struct gfs2_dir_ra *) ctx->opaque;
+ if (d_ra->inos)
+ kfree(d_ra->inos);
+ memset(d_ra, 0, sizeof(struct gfs2_dir_ra));
+}
+/**
+ * gfs2_dir_readahead - GFS2's implementation of readdir readahead
+ * @file : The directory to be read from
+ * @ctx : Context contains buffer to collect inode numbers
+ *
+ * Readahead inode disk blocks (and extended attribute blocks if requested)
+ * of every directory entry
+ *
+ * Returns: +ve number: The number of entries for which readahead calls
+ * were issued
+ * -ve values: For error conditions
+ */
+int gfs2_dir_readahead(struct file *file, struct dir_context *ctx, unsigned int count)
+{
+ int error = -EINVAL;
+ struct inode *dir = file->f_mapping->host;
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_holder d_gh;
+ struct gfs2_dir_ra d_ra;
+
+ if (!ctx)
+ goto out;
+
+ ctx->opaque = &d_ra;
+ error = init_ra_context(dip, ctx, count);
+ if (error)
+ goto out;
+
+ gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ error = gfs2_glock_nq(&d_gh);
+ if (error) {
+ gfs2_holder_uninit(&d_gh);
+ goto uninit;
+ }
+
+retry:
+ error = gfs2_dir_read(dir, ctx, &file->f_ra);
+ if (test_bit(RA_FL_HASHCOLL, &d_ra.flags)) {
+ clear_bit(RA_FL_HASHCOLL, &d_ra.flags);
+ goto retry;
+ }
+
+ /* Pass the collected inos to the workqueues to be read ahead */
+ if (d_ra.count)
+ error = gfs2_queue_dir_ra(ctx, GFS2_SB(dir));
+
+ gfs2_glock_dq_uninit(&d_gh);
+uninit:
+ uninit_ra_context(ctx);
+out:
+ return error;
+}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 26b3f95..6135bb9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1075,6 +1075,7 @@ const struct file_operations gfs2_file_fops = {

const struct file_operations gfs2_dir_fops = {
.iterate = gfs2_readdir,
+ .dir_readahead = gfs2_dir_readahead,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
@@ -1105,6 +1106,7 @@ const struct file_operations gfs2_file_fops_nolock = {

const struct file_operations gfs2_dir_fops_nolock = {
.iterate = gfs2_readdir,
+ .dir_readahead = gfs2_dir_readahead,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 82b6ac8..71e8ce5 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -161,9 +161,14 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_recovery;

+ gfs2_dir_ra_wq = alloc_workqueue("gfs2_dir_ra",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+ if (!gfs2_dir_ra_wq)
+ goto fail_control;
+
gfs2_page_pool = mempool_create_page_pool(64, 0);
if (!gfs2_page_pool)
- goto fail_control;
+ goto fail_ra;

gfs2_register_debugfs();

@@ -171,6 +176,8 @@ static int __init init_gfs2_fs(void)

return 0;

+fail_ra:
+ destroy_workqueue(gfs2_dir_ra_wq);
fail_control:
destroy_workqueue(gfs2_control_wq);
fail_recovery:
@@ -224,6 +231,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
destroy_workqueue(gfs2_control_wq);
+ destroy_workqueue(gfs2_dir_ra_wq);
list_lru_destroy(&gfs2_qd_lru);

rcu_barrier();
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 669e92e..8654636 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -849,6 +849,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
kthread_stop(sdp->sd_quotad_process);
kthread_stop(sdp->sd_logd_process);

+ flush_workqueue(gfs2_dir_ra_wq);
flush_workqueue(gfs2_delete_workqueue);
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
--
1.8.1.4
Andreas Dilger
2014-07-26 05:27:19 UTC
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?

The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.

This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.

As it stands, this syscall doesn't help in anything other than readdir
order (or if the directory is small enough to be handled in one
syscall), which could be handled by the kernel internally already,
and it may fetch a considerable number of extra inodes from
disk if not every inode needs to be touched.

Cheers, Andreas
fd - file descriptor of the directory being read ahead
*offset - offset in dir from which to resume. This is updated
as we move along in the directory
count - The max number of entries to read ahead

The syscall is supposed to read up to 'count' entries starting at
'*offset' and cache the inodes corresponding to those entries. It
returns a negative error code or a positive number indicating
the number of inodes it has issued readaheads for. It also
updates the '*offset' value so that repeated calls to dirreadahead
can resume at the right location. Returns 0 when there are no more
entries left.
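
(For illustration: a minimal sketch of how userspace might drive this
interface, assuming the x86_64 syscall number 317 added in patch 1/2 and
no glibc wrapper. The batch size of 1000 entries is arbitrary.)

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_dirreadahead 317	/* from arch/x86/syscalls/syscall_64.tbl */

int main(int argc, char **argv)
{
	long long off = 0;	/* kernel-side type is loff_t */
	long n;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return 1;

	/* Keep asking for readahead of up to 1000 entries at a time until
	 * the syscall returns 0 (no more entries) or fails. */
	while ((n = syscall(__NR_dirreadahead, fd, &off, 1000U)) > 0)
		printf("readahead issued for %ld entries, next offset %lld\n",
		       n, off);

	close(fd);
	return n < 0 ? 1 : 0;
}
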
fs: Add dirreadahead syscall and VFS hooks
gfs2: GFS2's implementation of the dir_readahead file operation
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
fs/gfs2/Makefile | 3 +-
fs/gfs2/dir.c | 49 ++++++---
fs/gfs2/dir.h | 15 +++
fs/gfs2/dir_readahead.c | 209 +++++++++++++++++++++++++++++++++++++++
fs/gfs2/file.c | 2 +
fs/gfs2/main.c | 10 +-
fs/gfs2/super.c | 1 +
fs/readdir.c | 49 +++++++++
include/linux/fs.h | 3 +
11 files changed, 328 insertions(+), 15 deletions(-)
create mode 100644 fs/gfs2/dir_readahead.c
--
1.8.1.4
Abhijith Das
2014-07-28 12:52:14 UTC
----- Original Message -----
Sent: Saturday, July 26, 2014 12:27:19 AM
Subject: Re: [RFC PATCH 0/2] dirreadahead system call
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
But yeah, as Steve mentioned in a previous email, limiting the inodes
read ahead in some fashion other than a range in readdir() order is something
that we are thinking of (list of inodes based on regexps, filenames etc). We
just chose to do an offset range of the directory for a quick, early
implementation.
As it stands, this syscall doesn't help in anything other than readdir
order (or of the directory is small enough to be handled in one
syscall), which could be handled by the kernel internally already,
and it may fetch a considerable number of extra inodes from
disk if not every inode needs to be touched.
The need for this syscall came up from a specific use case - Samba. I'm told
that Windows clients like to stat every file in a directory as soon as it is
read in and this has been a slow operation.

Cheers!
--Abhi
Andreas Dilger
2014-07-28 21:19:31 UTC
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes. For
example, if directories are small then it may be more efficient to
aggregate inodes from multiple directories for each prefetch syscall.
I can't really think of any locking issues that could exist with
"arbitrary list of inodes" that couldn't be hit by having a directory
with hard links to the same list of inodes, so this is something that
needs to be handled by the filesystem anyway.

Since this would be an advisory syscall (i.e. it doesn't really
return anything and cannot guarantee that all the inodes will be in
memory), then if the filesystem is having trouble prefetching the
inodes (e.g. invalid inode number(s) or lock ordering or contention
issues) it could always bail out and leave it to stat() to actually
fetch the inodes into memory when accessed.

There is no way it would be sane to keep inodes locked in the kernel
after prefetch, in case the "stat" never arrives, so the best it can
do is cache the inodes in memory (on the client for network fs), and
it is possible cache pressure or lock contention drops them from cache.

There are always going to be race conditions even if limited to a
single directory (e.g. another client modifies the inode after calling
dirreadahead(), but before calling stat()) that need to be handled.

I think there are a lot of benefits that could be had by the generic
syscall, possibly similar to what XFS is doing with the "bulkstat"
interfaces that Dave always mentions. This would be much more so for
cases were you don't want to stat all of the entries in a directory.
Post by Abhijith Das
But yeah, as Steve mentioned in a previous email, limiting the inodes
readahead in some fashion other than a range in readdir() order is
something that we are thinking of (list of inodes based on regexps,
filenames etc). We just chose to do an offset range of the directory
for a quick, early implementation.
Post by Andreas Dilger
As it stands, this syscall doesn't help in anything other than readdir
order (or of the directory is small enough to be handled in one
syscall), which could be handled by the kernel internally already,
and it may fetch a considerable number of extra inodes from
disk if not every inode needs to be touched.
The need for this syscall came up from a specific use case - Samba.
I'm told that Windows clients like to stat every file in a directory
as soon as it is read in and this has been a slow operation.
Cheers, Andreas
Steven Whitehouse
2014-07-29 09:36:31 UTC
Hi,
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Well GFS2 is more or less a local fs in the case of no contention. There
are issues that arise in the contention case, which is why it is very
important to ensure that there is as close to 100% chance of using the
inodes which are pre-fetched as possible. That's why we've not yet tried
out algorithms that try to guess when the inodes will be required, since
one wrong guess can be quite costly in terms of lock bouncing between
nodes. However it does seem that the suggestions made in this thread
about using ->lookup in the directory as one way to gather information
may be a good way to go, since it will only be called for uncached
inodes and in addition it would be equally useful to any possible
operation which may follow the lookup.
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes. For
example, if directories are small then it may be more efficient to
aggregate inodes from multiple directories for each prefetch syscall.
I can't really think of any locking issues that could exist with
"arbitrary list of inodes" that couldn't be hit by having a directory
with hard links to the same list of inodes, so this is something that
needs to be handled by the filesystem anyway.
Since this would be an advisory syscall (i.e. it doesn't really
return anything and cannot guarantee that all the inodes will be in
memory), then if the filesystem is having trouble prefetching the
inodes (e.g. invalid inode number(s) or lock ordering or contention
issues) it could always bail out and leave it to stat() to actually
fetch the inodes into memory when accessed.
There is no way it would be sane to keep inodes locked in the kernel
after prefetch, in case the "stat" never arrives, so the best it can
do is cache the inodes in memory (on the client for network fs), and
it is possible cache pressure or lock contention drops them from cache.
There are always going to be race conditions even if limited to a
single directory (e.g. another client modifies the inode after calling
dirreadahead(), but before calling stat()) that need to be handled.
I think there are a lot of benefits that could be had by the generic
syscall, possibly similar to what XFS is doing with the "bulkstat"
interfaces that Dave always mentions. This would be much more so for
cases were you don't want to stat all of the entries in a directory.
Yes, it probably would be possible to do a kind of bulk lookup like
that, although it would also be more complicated locking-wise if we
allowed arbitrary inode numbers, rather than doing it on a per directory
basis. I can see arguments for both approaches though,

Steve.
Dave Chinner
2014-07-31 04:49:09 UTC
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes.
I'm not sure we can do that. The only way to safely identify a
specific inode in the filesystem from userspace is via a filehandle.
Plain inode numbers are susceptible to TOCTOU race conditions that
the kernel cannot resolve. Also, lookup by inode number bypasses
directory access permissions, so is not something we would expose
to arbitrary unprivileged users.
Post by Andreas Dilger
There are always going to be race conditions even if limited to a
single directory (e.g. another client modifies the inode after calling
dirreadahead(), but before calling stat()) that need to be handled.
Unlink/realloc to a different directory with different access
permissions is the big issue.
Post by Andreas Dilger
I think there are a lot of benefits that could be had by the generic
syscall, possibly similar to what XFS is doing with the "bulkstat"
interfaces that Dave always mentions. This would be much more so for
cases were you don't want to stat all of the entries in a directory.
Bulkstat is not really suited to this - it gets its speed
specifically by avoiding directory traversal to find inodes. Hence
it is a root-only operation because it bypasses directory based
access restrictions and hence is really only useful for
full-filesystem traversal operations like backups, defragmentation,
etc.

Bulkstat output also contains enough information to construct a
valid file handle in userspace and so access to inodes found via
bulkstat can be gained via the XFS open-by-handle interfaces. Again,
this bypasses permissions checking and hence is a root-only
operation. It does, however, avoid TOCTOU races because the open-by-handle
will fail if the inode is unlinked and reallocated between the
bulkstat call and the open-by-handle as the generation number in the
handle will no longer match that of the inode.

Cheers,

Dave.
--
Dave Chinner
***@fromorbit.com
Andreas Dilger
2014-07-31 11:19:45 UTC
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes.
I'm not sure we can do that. The only way to safely identify a
specific inode in the filesystem from userspace is via a filehandle.
Plain inode numbers are susceptible to TOCTOU race conditions that
the kernel cannot resolve. Also, lookup by inode number bypasses
directory access permissions, so is not something we would expose
to arbitrary unprivileged users.
None of these issues are relevant in the API that I'm thinking about.
The syscall just passes the list of inode numbers to be prefetched
into kernel memory, and then stat() is used to actually get the data into
userspace (or whatever other operation is to be done on them),
so there is no danger if the wrong inode is prefetched. If the inode
number is bad the filesystem can just ignore it.
Post by Andreas Dilger
There are always going to be race conditions even if limited to a
single directory (e.g. another client modifies the inode after calling
dirreadahead(), but before calling stat()) that need to be handled.
Unlink/realloc to a different directory with different access
permissions is the big issue.
Post by Andreas Dilger
I think there are a lot of benefits that could be had by the generic
syscall, possibly similar to what XFS is doing with the "bulkstat"
interfaces that Dave always mentions. This would be much more so for
cases were you don't want to stat all of the entries in a directory.
Bulkstat is not really suited to this - it gets it's speed
specifically by avoiding directory traversal to find inodes. Hence
it is a root-only operation because it bypasses directory based
access restrictions and hence is really only useful for
full-filesystem traversal operations like backups, defragmentation,
etc.
Maybe it was a mistake to mention bulkstat in this discussion. The
stat() information would not actually be returned to memory in my
proposal, just the inodes would be loaded into memory in bulk so
that the later stat() (or unlink() or whatever) can go quickly.

Cheers, Andreas
Bulkstat output also contains enough information to construct a
valid file handle in userspace and so access to inodes found via
bulkstat can be gained via the XFS open-by-handle interfaces. Again,
this bypasses permissions checking and hence is a root-only
operation. It does, however, avoid TOCTOU races because the open-by-handle
will fail if the inode is unlinked and reallocated between the
bulkstat call and the open-by-handle as the generation number in the
handle will no longer match that of the inode.
Cheers,
Dave.
--
Dave Chinner
Dave Chinner
2014-07-31 23:53:06 UTC
Post by Andreas Dilger
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account if ".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes.
I'm not sure we can do that. The only way to safely identify a
specific inode in the filesystem from userspace is via a filehandle.
Plain inode numbers are susceptible to TOCTOU race conditions that
the kernel cannot resolve. Also, lookup by inode number bypasses
directory access permissions, so is not something we would expose
to arbitrary unprivileged users.
None of these issues are relevant in the API that I'm thinking about.
The syscall just passes the list of inode numbers to be prefetched
into kernel memory, and then stat() is used to actually get the data into
userspace (or whatever other operation is to be done on them),
so there is no danger if the wrong inode is prefetched. If the inode
number is bad the filesystem can just ignore it.
Which means the filesystem has to treat the inode number as
potentially hostile. i.e. it can not be trusted to be correct and so
must take slow paths to validate the inode numbers. This adds
*significant* overhead to the readahead path for some filesystems:
readahead is only a win if it is low cost.

For example, on XFS every untrusted inode number lookup requires an
inode btree lookup to validate the inode is actually valid on disk
and that it is allocated and has references. That lookup serialises
against inode allocation/freeing as well as other lookups. In
comparison, when using a trusted inode number from a directory
lookup within the kernel, we only need to do a couple of shift and
mask operations to convert it to a disk address and we are good to
go.
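
(Schematically, the trusted case is on the order of the following; the
bit widths are invented purely for illustration and are not the real XFS
geometry, which comes from the superblock.)

/* Hypothetical layout: high bits of the inode number select the
 * allocation group, low bits locate the inode within that group. */
#define AGINO_BITS		32
#define INODES_PER_BLOCK_SHIFT	5

static inline unsigned long long ino_to_agno(unsigned long long ino)
{
	return ino >> AGINO_BITS;		/* allocation group number */
}

static inline unsigned long long ino_to_agbno(unsigned long long ino)
{
	/* block offset within the allocation group */
	return (ino & ((1ULL << AGINO_BITS) - 1)) >> INODES_PER_BLOCK_SHIFT;
}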

i.e. the difference is at least 5 orders of magnitude higher CPU usage
for an "inode number readahead" syscall versus a "directory
readahead" syscall, it has significant serialisation issues and it
can stall other modification/lookups going on at the same time.
That's *horrible behaviour* for a speculative readahead operation,
but because the inode numbers are untrusted, we can't avoid it.

So, again, it's way more overhead than userspace just calling
stat() asynchronously on many files at once as readdir/getdents
returns dirents from the kernel to speed up cache population.

That's my main issue with this patchset - it's implementing
something in kernelspace that can *easily* be done generically in
userspace without introducing all sorts of nasty corner cases that
we have to handle in the kernel. We only add functionality to the kernel if there's a
compelling reason to do it in kernelspace, and right now I just
don't see any numbers that justify adding readdir+stat() readahead
or inode number based cache population in kernelspace.

Before we add *any* syscall for directory readahead, we need
comparison numbers against doing the dumb multithreaded
userspace readahead of stat() calls. If userspace can do this as
fast as the kernel can....
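
(For concreteness, the sort of dumb multithreaded userspace readahead
meant here might look like the sketch below: one readdir() pass collects
the names, then a few worker threads stat() them purely to warm the
inode cache. The thread count and error handling are placeholders.)

#include <dirent.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#define NR_THREADS 8	/* arbitrary; would need tuning */

static char **names;
static size_t nr_names, next_idx;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static const char *dirpath;

static void *stat_worker(void *unused)
{
	char path[4096];
	struct stat st;
	size_t i;

	(void)unused;
	for (;;) {
		pthread_mutex_lock(&lock);
		i = next_idx++;
		pthread_mutex_unlock(&lock);
		if (i >= nr_names)
			return NULL;
		/* Result is thrown away; the point is the side effect of
		 * pulling the inode into cache before the real stat() pass. */
		snprintf(path, sizeof(path), "%s/%s", dirpath, names[i]);
		stat(path, &st);
	}
}

int main(int argc, char **argv)
{
	pthread_t tid[NR_THREADS];
	struct dirent *de;
	DIR *dir;
	int i;

	if (argc < 2 || !(dir = opendir(argv[1])))
		return 1;
	dirpath = argv[1];

	/* Single readdir() pass to collect the names... */
	while ((de = readdir(dir)) != NULL) {
		char **tmp = realloc(names, (nr_names + 1) * sizeof(*names));
		if (!tmp)
			break;
		names = tmp;
		names[nr_names++] = strdup(de->d_name);
	}
	closedir(dir);

	/* ...then fan the stat() calls out across worker threads. */
	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&tid[i], NULL, stat_worker, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}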

Cheers,

Dave.
--
Dave Chinner
***@fromorbit.com
Abhijith Das
2014-08-01 02:11:05 UTC
----- Original Message -----
Sent: Thursday, July 31, 2014 6:53:06 PM
Subject: Re: [RFC PATCH 0/2] dirreadahead system call
Post by Andreas Dilger
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account
if
".*"
entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes.
I'm not sure we can do that. The only way to safely identify a
specific inode in the filesystem from userspace is via a filehandle.
Plain inode numbers are susceptible to TOCTOU race conditions that
the kernel cannot resolve. Also, lookup by inode number bypasses
directory access permissions, so is not something we would expose
to arbitrary unprivileged users.
None of these issues are relevant in the API that I'm thinking about.
The syscall just passes the list of inode numbers to be prefetched
into kernel memory, and then stat() is used to actually get the data into
userspace (or whatever other operation is to be done on them),
so there is no danger if the wrong inode is prefetched. If the inode
number is bad the filesystem can just ignore it.
Which means the filesystem has to treat the inode number as
potentially hostile. i.e. it can not be trusted to be correct and so
must take slow paths to validate the inode numbers. This adds
readahead is only a win if it is low cost.
For example, on XFS every untrusted inode number lookup requires an
inode btree lookup to validate the inode is actually valid on disk
and that is it allocated and has references. That lookup serialises
against inode allocation/freeing as well as other lookups. In
comparison, when using a trusted inode number from a directory
lookup within the kernel, we only need to do a couple of shift and
mask operations to convert it to a disk address and we are good to
go.
i.e. the difference is at least 5 orders of magnitude higher CPU usage
for an "inode number readahead" syscall versus a "directory
readahead" syscall, it has significant serialisation issues and it
can stall other modification/lookups going on at the same time.
That's *horrible behaviour* for a speculative readahead operation,
but because the inodenumbers are untrusted, we can't avoid it.
So, again, it's way more overhead than userspace just calling
stat() asycnhronously on many files at once as readdir/gentdents
returns dirents from the kernel to speed up cache population.
That's my main issue with this patchset - it's implementing
something in kernelspace that can *easily* be done generically in
userspace without introducing all sorts of nasty corner cases that
we have to handle in the kernel. We only add functionality to the kernel if there's a
compelling reason to do it in kernelspace, and right now I just
don't see any numbers that justify adding readdir+stat() readahead
or inode number based cache population in kernelspace.
Before we add *any* syscall for directory readahead, we need
comparison numbers against doing the dumb multithreaded
userspace readahead of stat() calls. If userspace can do this as
fast as the kernel can....
This was next on my to-do list to see how an all-userspace solution compares
with doing this in the kernel. I'll post some results as soon as I can find
some time to play with this stuff again.

Cheers!
--Abhi
Andreas Dilger
2014-08-01 05:54:56 UTC
Post by Dave Chinner
Post by Andreas Dilger
None of these issues are relevant in the API that I'm thinking about.
The syscall just passes the list of inode numbers to be prefetched
into kernel memory, and then stat() is used to actually get the data into
userspace (or whatever other operation is to be done on them),
so there is no danger if the wrong inode is prefetched. If the inode
number is bad the filesystem can just ignore it.
Which means the filesystem has to treat the inode number as
potentially hostile. i.e. it can not be trusted to be correct and so
must take slow paths to validate the inode numbers. This adds
readahead is only a win if it is low cost.
For example, on XFS every untrusted inode number lookup requires an
inode btree lookup to validate the inode is actually valid on disk
and that is it allocated and has references. That lookup serialises
against inode allocation/freeing as well as other lookups. In
comparison, when using a trusted inode number from a directory
lookup within the kernel, we only need to do a couple of shift and
mask operations to convert it to a disk address and we are good to
go.
i.e. the difference is at least 5 orders of magnitude higher CPU usage
for an "inode number readahead" syscall versus a "directory
readahead" syscall, it has significant serialisation issues and it
can stall other modification/lookups going on at the same time.
That's *horrible behaviour* for a speculative readahead operation,
but because the inodenumbers are untrusted, we can't avoid it.
For ext4 this is virtually free. The same needs to happen for any
inode number from NFS so it can't be that bad.

Also, since this API would be prefetching inodes in bulk it
could presumably optimize this to some extent.
Post by Dave Chinner
So, again, it's way more overhead than userspace just calling
stat() asycnhronously on many files at once as readdir/gentdents
returns dirents from the kernel to speed up cache population.
To me this seems like saying it is just as fast to submit hundreds of
256-byte random reads for a file as it is for large linear reads with
readahead. Yes, it is possible for the kernel to optimize the random
read workload to some extent, but not as easily as getting reads
in order in the first place.
Post by Dave Chinner
That's my main issue with this patchset - it's implementing
something in kernelspace that can *easily* be done generically in
userspace without introducing all sorts of nasty corner cases that
we have to handle in the kernel. We only add functionality to the kernel if there's a
compelling reason to do it in kernelspace, and right now I just
don't see any numbers that justify adding readdir+stat() readahead
or inode number based cache population in kernelspace.
The original patch showed there was definitely a significant win with
the prefetch case over the single threaded readdir+stat. There is
also something to be said for keeping complexity out of applications.
Sure it is possible for apps to get good performance from AIO,
but very few do so because of complexity.
Post by Dave Chinner
Before we add *any* syscall for directory readahead, we need
comparison numbers against doing the dumb multithreaded
userspace readahead of stat() calls. If userspace can do this as
fast as the kernel can....
I'd be interested to see this also, but my prediction is that this will not
deliver the kind of improvements you are expecting.

Cheers, Andreas
Dave Chinner
2014-08-06 02:01:49 UTC
Post by Andreas Dilger
Post by Dave Chinner
Post by Andreas Dilger
None of these issues are relevant in the API that I'm thinking about.
The syscall just passes the list of inode numbers to be prefetched
into kernel memory, and then stat() is used to actually get the data into
userspace (or whatever other operation is to be done on them),
so there is no danger if the wrong inode is prefetched. If the inode
number is bad the filesystem can just ignore it.
Which means the filesystem has to treat the inode number as
potentially hostile. i.e. it can not be trusted to be correct and so
must take slow paths to validate the inode numbers. This adds
readahead is only a win if it is low cost.
For example, on XFS every untrusted inode number lookup requires an
inode btree lookup to validate the inode is actually valid on disk
and that is it allocated and has references. That lookup serialises
against inode allocation/freeing as well as other lookups. In
comparison, when using a trusted inode number from a directory
lookup within the kernel, we only need to do a couple of shift and
mask operations to convert it to a disk address and we are good to
go.
i.e. the difference is at least 5 orders of magnitude higher CPU usage
for an "inode number readahead" syscall versus a "directory
readahead" syscall, it has significant serialisation issues and it
can stall other modification/lookups going on at the same time.
That's *horrible behaviour* for a speculative readahead operation,
but because the inodenumbers are untrusted, we can't avoid it.
For ext4 this is virtually free.
Yes, due to its old-school fixed inode table design. But we aren't
designing functionality exclusively for ext4 - many filesystems do
dynamic inode allocation - and so we need to take the overhead of
inode number validation into account when discussing the
effectiveness of a generic readahead mechanism.
Post by Andreas Dilger
The same needs to happen for any
inode number from NFS so it can't be that bad.
NFS is RPC rate limited, not inode number validation rate limited -
the inode number lookup during file handle translation is not a
limiting factor. e.g. we might be able to do 50,000 NFS getattr RPCs
per second on a modern CPU when we hit the inode cache and 10% of
that CPU load will be file handle translation.

However, local filesystems might be able to do 500,000 stat() calls
with the same CPU from a local filesystem on cached inodes. Let's
now add readahead that uses 10% of a CPU per 50,000 inode
numbers. Ignoring the overhead of everything else but
filehandle translation, we run out of CPU at 500,000 inode number
lookups per second. So we burn up the entire CPU just doing inode
number validation and cache lookups, without actually doing any real
work. So adding inode number based readahead just slowed this
workload down by 50%...

Readahead is not always a win....
Post by Andreas Dilger
Also, since this API would be prefetching inodes in bulk it
could presumably optimize this to some extent.
So can userspace.
Post by Andreas Dilger
Post by Dave Chinner
So, again, it's way more overhead than userspace just calling
stat() asycnhronously on many files at once as readdir/gentdents
returns dirents from the kernel to speed up cache population.
To me this seems like saying it is just as fast to submit hundreds of
256-byte random reads for a file as it is for large linear reads with
readahead. Yes, it is possible for the kernel to optimize the random
read workload to some extent, but not as easily as getting reads
in order in the first place.
Strawman.

For ext4, optimised inode read ordering is as simple as dispatching
the IO in ascending inode number order. Same as for many other
filesystems that use direct indexed, fixed location tables for
inodes. And it's the same for XFS, where the inode number is a
direct encoding of it's location in the address space of the
underlying block device.

Seriously, Chris Mason proved years ago how much we can gain
from simple userspace optimisations with his ACP tool.

https://oss.oracle.com/~mason/acp/

This doesn't even use threading - just orders all the stat() calls
and readahead() based on the inode number or a FIBMAP call - and it
shows how substantial the improvements can be from simple userspace
changes.
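
(A rough sketch of that ordering trick, using the inode number from the
dirent rather than an actual FIBMAP call; purely illustrative.)

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>

struct ent {
	ino_t ino;
	char name[256];
};

static int cmp_ino(const void *a, const void *b)
{
	const struct ent *ea = a, *eb = b;

	return (ea->ino > eb->ino) - (ea->ino < eb->ino);
}

int main(int argc, char **argv)
{
	struct ent *ents = NULL;
	size_t n = 0, i;
	struct dirent *de;
	struct stat st;
	char path[4096];
	DIR *dir;

	if (argc < 2 || !(dir = opendir(argv[1])))
		return 1;

	/* Collect the dirents, remembering each entry's inode number. */
	while ((de = readdir(dir)) != NULL) {
		struct ent *tmp = realloc(ents, (n + 1) * sizeof(*ents));
		if (!tmp)
			break;
		ents = tmp;
		ents[n].ino = de->d_ino;
		snprintf(ents[n].name, sizeof(ents[n].name), "%s", de->d_name);
		n++;
	}
	closedir(dir);

	/* Sort by inode number so the stat() pass walks the inode table in
	 * roughly ascending disk order. */
	qsort(ents, n, sizeof(*ents), cmp_ino);
	for (i = 0; i < n; i++) {
		snprintf(path, sizeof(path), "%s/%s", argv[1], ents[i].name);
		stat(path, &st);
	}
	free(ents);
	return 0;
}
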
Post by Andreas Dilger
Post by Dave Chinner
That's my main issue with this patchset - it's implementing
something in kernelspace that can *easily* be done generically in
userspace without introducing all sorts of nasty corner cases that
we have to handle in the kernel. We only add functionality to the kernel if there's a
compelling reason to do it in kernelspace, and right now I just
don't see any numbers that justify adding readdir+stat() readahead
or inode number based cache population in kernelspace.
The original patch showed there was definitely a significant win with
the prefetch case over the single threaded readdir+stat.
Sure, but so is acp, hence showing what userspace optimisation gains
us before we even consider implementations using threading and/or
async stat() calls.
Post by Andreas Dilger
There is
also something to be said for keeping complexity out of applications.
Yes, that's why I've been saying it should be in glibc hidden behind
ftw() and friends so no application has to be changed to benefit
from the readahead on traversal workloads.
Post by Andreas Dilger
Sure it is possible for apps to get good performance from AIO,
but very few do so because of complexity.
True, but it's also true that few use AIO because they don't need
the few extra percent of performance that AIO can extract from the
system.
Post by Andreas Dilger
Post by Dave Chinner
Before we add *any* syscall for directory readahead, we need
comparison numbers against doing the dumb multithreaded
userspace readahead of stat() calls. If userspace can do this as
fast as the kernel can....
I'd be interested to see this also, but my prediction is that this will not
deliver the kind of improvements you are expecting.
Do you really think I'd be saying we need numbers if I didn't have
a good idea of what the numbers are going to say? ;)

Cheers,

Dave.
--
Dave Chinner
***@fromorbit.com
Abhijith Das
2014-10-21 05:21:31 UTC
Permalink
Raw Message
----- Original Message -----
Sent: Thursday, July 31, 2014 6:53:06 PM
Subject: Re: [RFC PATCH 0/2] dirreadahead system call
Post by Andreas Dilger
Post by Andreas Dilger
Post by Abhijith Das
Post by Andreas Dilger
Is there a time when this doesn't get called to prefetch entries in
readdir() order? It isn't clear to me what benefit there is of returning
the entries to userspace instead of just doing the statahead implicitly
in the kernel?
The Lustre client has had what we call "statahead" for a while,
and similar to regular file readahead it detects the sequential access
pattern for readdir() + stat() in readdir() order (taking into account
if ".*" entries are being processed or not) and starts fetching the inode
attributes asynchronously with a worker thread.
Does this heuristic work well in practice? In the use case we were trying to
address, a Samba server is aware beforehand if it is going to stat all the
inodes in a directory.
Typically this works well for us, because this is done by the Lustre
client, so the statahead is hiding the network latency of the RPCs to
fetch attributes from the server. I imagine the same could be seen with
GFS2. I don't know if this approach would help very much for local
filesystems because the latency is low.
Post by Abhijith Das
Post by Andreas Dilger
This syscall might be more useful if userspace called readdir() to get
the dirents and then passed the kernel the list of inode numbers
to prefetch before starting on the stat() calls. That way, userspace
could generate an arbitrary list of inodes (e.g. names matching a
regexp) and the kernel doesn't need to guess if every inode is needed.
Were you thinking arbitrary inodes across the filesystem or just a subset
from a directory? Arbitrary inodes may potentially throw up locking issues.
I was thinking about inodes returned from readdir(), but the syscall
would be much more useful if it could handle arbitrary inodes.
I'm not sure we can do that. The only way to safely identify a
specific inode in the filesystem from userspace is via a filehandle.
Plain inode numbers are susceptible to TOCTOU race conditions that
the kernel cannot resolve. Also, lookup by inode number bypasses
directory access permissions, so is not something we would expose
to arbitrary unprivileged users.
None of these issues are relevant in the API that I'm thinking about.
The syscall just passes the list of inode numbers to be prefetched
into kernel memory, and then stat() is used to actually get the data into
userspace (or whatever other operation is to be done on them),
so there is no danger if the wrong inode is prefetched. If the inode
number is bad the filesystem can just ignore it.
Which means the filesystem has to treat the inode number as
potentially hostile. i.e. it can not be trusted to be correct and so
must take slow paths to validate the inode numbers. This adds
overhead, and readahead is only a win if it is low cost.
For example, on XFS every untrusted inode number lookup requires an
inode btree lookup to validate the inode is actually valid on disk
and that it is allocated and has references. That lookup serialises
against inode allocation/freeing as well as other lookups. In
comparison, when using a trusted inode number from a directory
lookup within the kernel, we only need to do a couple of shift and
mask operations to convert it to a disk address and we are good to
go.
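
For illustration, a sketch of that trusted-inode-number conversion with
made-up geometry constants (not the actual XFS macros, just the shape of
the arithmetic):

/* decoding a location-encoding inode number; geometry values are invented */
#include <stdint.h>
#include <stdio.h>

#define INOPB_LOG   5                   /* log2(inodes per block)  - illustrative */
#define AGBLK_LOG  20                   /* log2(blocks per AG)     - illustrative */
#define AGINO_LOG  (AGBLK_LOG + INOPB_LOG)

int main(void)
{
        uint64_t ino   = 0x2000123ULL;                  /* trusted inode number */
        uint64_t agno  = ino >> AGINO_LOG;              /* allocation group */
        uint64_t agino = ino & ((1ULL << AGINO_LOG) - 1);
        uint64_t agbno = agino >> INOPB_LOG;            /* block within the AG */
        uint64_t slot  = agino & ((1ULL << INOPB_LOG) - 1); /* inode within block */

        /* a couple of shifts and masks and we have a disk location to read */
        printf("ag %llu block %llu slot %llu\n",
               (unsigned long long)agno, (unsigned long long)agbno,
               (unsigned long long)slot);
        return 0;
}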
i.e. the difference is at least 5 orders of magnitude higher CPU usage
for an "inode number readahead" syscall versus a "directory
readahead" syscall; it has significant serialisation issues and it
can stall other modifications/lookups going on at the same time.
That's *horrible behaviour* for a speculative readahead operation,
but because the inode numbers are untrusted, we can't avoid it.
So, again, it's way more overhead than userspace just calling
stat() asynchronously on many files at once as readdir/getdents
returns dirents from the kernel to speed up cache population.
That's my main issue with this patchset - it's implementing
something in kernelspace that can *easily* be done generically in
userspace without introducing all sorts of nasty corner cases that
we have to handle in the kernel. We only add functionality to the kernel if there's a
compelling reason to do it in kernelspace, and right now I just
don't see any numbers that justify adding readdir+stat() readahead
or inode number based cache population in kernelspace.
Before we add *any* syscall for directory readahead, we need
comparison numbers against doing the dumb multithreaded
userspace readahead of stat() calls. If userspace can do this as
fast as the kernel can....
Hi Dave/all,

I finally got around to playing with the multithreaded userspace readahead
idea and the results are quite promising. I tried to mimic what my kernel
readahead patch did with this userspace program (userspace_ra.c)
Source code here: https://www.dropbox.com/s/am9q26ndoiw1cdr/userspace_ra.c?dl=0

Each thread has an associated buffer into which a chunk of directory
entries is read using getdents(). Each thread then sorts the entries in
inode number order (for GFS2, this is also their disk block order) and proceeds
to pull the inodes into cache in that order by issuing open(2) syscalls against them.
In my tests, I backgrounded this program and issued an 'ls -l' on the dir
in question. I did the same following the kernel dirreadahead syscall as well.
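
For reference, a rough sketch of the per-thread step described above (not
the actual userspace_ra.c; the buffer is assumed to hold the records from
one getdents64() call, and error handling is trimmed):

/* sort one chunk of dirents by inode number, then open() each to cache it */
#define _GNU_SOURCE
#include <dirent.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

struct ent { uint64_t ino; char name[256]; };

static int by_ino(const void *a, const void *b)
{
        const struct ent *x = a, *y = b;
        return (x->ino > y->ino) - (x->ino < y->ino);
}

static void cache_chunk(int dirfd, char *buf, int nbytes)
{
        struct ent *ents = calloc(4096, sizeof(*ents)); /* bound for the sketch */
        struct dirent64 *d = NULL;
        int off, n = 0, i, fd;

        for (off = 0; off < nbytes && n < 4096; off += d->d_reclen) {
                d = (struct dirent64 *)(buf + off);
                ents[n].ino = d->d_ino;
                strncpy(ents[n].name, d->d_name, sizeof(ents[n].name) - 1);
                n++;
        }
        qsort(ents, n, sizeof(*ents), by_ino);          /* disk order on GFS2 */
        for (i = 0; i < n; i++) {
                fd = openat(dirfd, ents[i].name, O_RDONLY | O_NONBLOCK | O_NOFOLLOW);
                if (fd >= 0)
                        close(fd);                      /* inode is now cached */
        }
        free(ents);
}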

I did not manage to test out too many parameter combinations for both
userspace_ra and SYS_dirreadahead because the test matrix got pretty big and
time consuming. However, I did notice that without sorting, userspace_ra did
not perform as well in some of my tests. I haven't investigated that yet,
so the numbers shown here are all with sorting enabled.

For a directory with 100000 files,
a) simple 'ls -l' took 14m11s
b) SYS_dirreadahead + 'ls -l' took 3m9s, and
c) userspace_ra (1M buffer/thread, 32 threads) took 1m42s

[graph link elided by the archive] The linked graph
contains a few more data points. In the graph, along with data for 'ls -l'
and SYS_dirreadahead, there are six data series for userspace_ra for each
directory size (10K, 100K and 200K files). i.e. u_ra:XXX,YYY, where XXX is one
of (64K, 1M) buffer size and YYY is one of (4, 16, 32) threads.

Cheers!
--Abhi

Thomas Martitz
2014-08-01 09:14:37 UTC
Permalink
Raw Message
Post by Andreas Dilger
Since this would be an advisory syscall (i.e. it doesn't really
return anything and cannot guarantee that all the inodes will be in
memory), then if the filesystem is having trouble prefetching the
inodes (e.g. invalid inode number(s) or lock ordering or contention
issues) it could always bail out and leave it to stat() to actually
fetch the inodes into memory when accessed.
The first sentence made me wonder whether the existing fadvise64() syscall
can be reused (with the dir fd) in place of the new dirreadahead()?
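
Just to make the thought concrete, the call site would look like this
(this only shows the API shape; current kernels do not do any inode
prefetch for a directory fd here):

/* hypothetical reuse of posix_fadvise()/fadvise64() on a directory fd */
#include <fcntl.h>
#include <unistd.h>

int dir_readahead_via_fadvise(const char *path)
{
        int fd = open(path, O_RDONLY | O_DIRECTORY);
        int err;

        if (fd < 0)
                return -1;
        /* offset 0, len 0 means "to the end" in fadvise terms */
        err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
        close(fd);
        return err;
}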

If not, dirreadahead should have a flags param
(https://lwn.net/Articles/585415/)

Best regards.

Michael Kerrisk
2014-07-29 08:19:58 UTC
Permalink
Raw Message
fd - file descriptor of the directory being readahead
*offset - offset in dir from which to resume. This is updated
as we move along in the directory
count - The max number of entries to readahead
The syscall is supposed to read up to 'count' entries starting at
'*offset' and cache the inodes corresponding to those entries. It
returns a negative error code or a positive number indicating
the number of inodes it has issued readaheads for. It also
updates the '*offset' value so that repeated calls to dirreadahead
can resume at the right location. Returns 0 when there are no more
entries left.
Hello Abhi,

As per Documentation/SubmitChecklist, please CC linux-api on patches
that change the kernel-user-space API/ABI. (See
https://www.kernel.org/doc/man-pages/linux-api-ml.html for more
details.)

Cheers,

Michael
fs: Add dirreadahead syscall and VFS hooks
gfs2: GFS2's implementation of the dir_readahead file operation
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
fs/gfs2/Makefile | 3 +-
fs/gfs2/dir.c | 49 ++++++---
fs/gfs2/dir.h | 15 +++
fs/gfs2/dir_readahead.c | 209 +++++++++++++++++++++++++++++++++++++++
fs/gfs2/file.c | 2 +
fs/gfs2/main.c | 10 +-
fs/gfs2/super.c | 1 +
fs/readdir.c | 49 +++++++++
include/linux/fs.h | 3 +
11 files changed, 328 insertions(+), 15 deletions(-)
create mode 100644 fs/gfs2/dir_readahead.c
--
1.8.1.4
--
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/
Michael Kerrisk
2014-07-29 08:21:50 UTC
Permalink
Raw Message
[CC+=linux-api]
Post by Abhi Das
Also adds a void *opaque field to struct dir_context that can be
used by filesystems to temporarily store any context as this
struct gets passed around in the fs.
---
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
fs/readdir.c | 49 ++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +++
4 files changed, 54 insertions(+)
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b8679..3e0ef85 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
+354 i386 dirreadahead sys_dirreadahead
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1..2ec0991 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common dirreadahead sys_dirreadahead
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/readdir.c b/fs/readdir.c
index 33fd922..d216db7 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -198,6 +198,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user * lastdirent;
struct getdents_callback buf = {
.ctx.actor = filldir,
+ .ctx.opaque = NULL,
.count = count,
.current_dir = dirent
};
@@ -278,6 +279,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
struct linux_dirent64 __user * lastdirent;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
+ .ctx.opaque = NULL,
.count = count,
.current_dir = dirent
};
@@ -304,3 +306,50 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
fdput(f);
return error;
}
+
+SYSCALL_DEFINE3(dirreadahead, unsigned int, fd,
+ loff_t __user *, offset, unsigned int, count)
+{
+ struct fd f;
+ struct inode *inode;
+ int error = -ENOTDIR;
+ loff_t off = 0;
+ struct dir_context ctx = {.actor = NULL, .opaque = NULL};
+
+ if (!count)
+ return -EINVAL;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ inode = f.file->f_path.dentry->d_inode;
+
+ error = -ENOTSUPP;
+ if (!f.file->f_op || !f.file->f_op->dir_readahead)
+ goto out;
+
+ error = security_file_permission(f.file, MAY_READ);
+ if (error)
+ goto out;
+
+ error = -EFAULT;
+ if (__get_user(ctx.pos, offset))
+ goto out;
+
+ error = mutex_lock_killable(&inode->i_mutex);
+ if (error)
+ goto out;
+
+ error = -ENOENT;
+ if (!IS_DEADDIR(inode)) {
+ error = f.file->f_op->dir_readahead(f.file, &ctx, count);
+ if (__put_user(ctx.pos, offset))
+ error = -EFAULT;
+ file_accessed(f.file);
+ }
+ mutex_unlock(&inode->i_mutex);
+out:
+ fdput(f);
+ return error;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 338e6f7..fae4a6e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1438,9 +1438,11 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
* to have different dirent layouts depending on the binary type.
*/
typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
struct dir_context {
const filldir_t actor;
loff_t pos;
+ void *opaque;
};
struct block_device_operations;
@@ -1463,6 +1465,7 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
+ int (*dir_readahead) (struct file *, struct dir_context *, unsigned int);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
--
1.8.1.4
--
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/
Dave Chinner
2014-07-31 03:31:56 UTC
Permalink
Raw Message
Post by Michael Kerrisk
[CC+=linux-api]
Post by Abhi Das
Also adds a void *opaque field to struct dir_context that can be
used by filesystems to temporarily store any context as this
struct gets passed around in the fs.
So the prototype is:

int dir_readahead(int fd, off64_t offset, unsigned int count);

Why do we need a new syscall for this?

$ man 2 readahead
....
ssize_t readahead(int fd, off64_t offset, size_t count);
....
EINVAL fd does not refer to a file type to which readahead() can be applied.


Cheers,

Dave.
--
Dave Chinner
david-***@public.gmane.org
NeilBrown
2014-07-31 03:18:05 UTC
Permalink
Raw Message
fd - file descriptor of the directory being readahead
*offset - offset in dir from which to resume. This is updated
as we move along in the directory
count - The max number of entries to readahead
The syscall is supposed to read up to 'count' entries starting at
'*offset' and cache the inodes corresponding to those entries. It
returns a negative error code or a positive number indicating
the number of inodes it has issued readaheads for. It also
updates the '*offset' value so that repeated calls to dirreadahead
can resume at the right location. Returns 0 when there are no more
entries left.
Hi Abhi,

I like the idea of enhanced read-ahead on a directory.
It isn't clear to me why you have included these particular fields in the
interface though.

- why have an 'offset'? Why not just use the current offset of the
directory 'fd'?
- Why have a count? How would a program choose what count to give?

Maybe you imagine using 'getdents' first to get a list of names, then
selectively calling 'dirreadahead' on the offsets of the names you are
interested in? That would be racy as names can be added and removed which
might change offsets. So maybe you have another reason?

I would like to suggest an alternate interface (I love playing the API
game....).

1/ Add a flag to 'fstatat' AT_EXPECT_MORE.
If the pathname does not contain a '/', then the 'dirfd' is marked
to indicate that stat information for all names returned by getdents will
be wanted. The filesystem can choose to optimise that however it sees
fit.

2/ Add a flag to 'fstatat' AT_NONBLOCK.
This tells the filesystem that you want this information, so if it can
return it immediately it should, and if not it should start pulling it
into cache. Possibly this should be two flags: AT_NONBLOCK just avoids
any IO, and AT_ASYNC instigates IO even if NONBLOCK is set.

Then an "ls -l" could use AT_EXPECT_MORE and then just stat each name.
An "ls -l *.c", might avoid AT_EXPECT_MORE, but would use AT_NONBLOCK
against all names, then try again with all the names that returned
EWOULDBLOCK the first time.
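
To sketch that two-pass pattern (AT_EXPECT_MORE/AT_NONBLOCK do not exist
today; the flag value below is invented purely for illustration):

/* hypothetical two-pass stat: first pass tolerates EWOULDBLOCK, second retries */
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>

#define AT_NONBLOCK  0x10000    /* invented value; not a real AT_* flag */

/* returns 1 if 'name' should be retried once its attributes are cached */
static int try_stat(int dirfd, const char *name, struct stat *st)
{
        if (fstatat(dirfd, name, st, AT_NONBLOCK) == 0)
                return 0;
        return errno == EWOULDBLOCK;    /* IO was started; come back later */
}

The caller would walk all the names once with try_stat(), remember those
that returned 1, and then loop over just that remainder with a plain
fstatat() once the readahead has had time to land.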


I would really like to see the 'xstat' syscall too, but there is no point
having both "xstat" and "fxstat". Follow the model of "fstatat" and provide
just "fxstatat" which can do both.
With fxstatat, AT_EXPECT_MORE would tell the dirfd exactly which attributes
would be wanted so it can fetch only that which is desired.

I'm not very keen on the xgetdents idea of including name information and
stat information into the one syscall - I would prefer getdents and xstat be
kept separate. Of course if a genuine performance cost of the separation can
be demonstrated, I could well change my mind.

It does, however, have the advantage that the kernel doesn't need to worry
about how long read-ahead data needs to be kept, and the application doesn't
need to worry about how soon to retry an fstatat which failed with
EWOULDBLOCK.

Thanks for raising this issue again. I hope it gets fixed one day...

NeilBrown
Abhijith Das
2014-08-01 02:21:14 UTC
Permalink
Raw Message
----- Original Message -----
Sent: Wednesday, July 30, 2014 10:18:05 PM
Subject: Re: [RFC PATCH 0/2] dirreadahead system call
fd - file descriptor of the directory being readahead
*offset - offset in dir from which to resume. This is updated
as we move along in the directory
count - The max number of entries to readahead
The syscall is supposed to read up to 'count' entries starting at
'*offset' and cache the inodes corresponding to those entries. It
returns a negative error code or a positive number indicating
the number of inodes it has issued readaheads for. It also
updates the '*offset' value so that repeated calls to dirreadahead
can resume at the right location. Returns 0 when there are no more
entries left.
Hi Abhi,
I like the idea of enhanced read-ahead on a directory.
It isn't clear to me why you have included these particular fields in the
interface though.
- why have an 'offset'? Why not just use the current offset of the
directory 'fd'?
The idea was that we didn't want a syscall like readahead mucking with the
file pointer as the same fd might be used to do getdents().
- Why have a count? How would a program choose what count to give?
If a program knows that it's only going to work on a subset of files at a time,
it can use the count value to only readahead a small number of inodes at once
instead of reading ahead the entire directory.
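
A sketch of that chunked usage, calling the RFC syscall directly (317 is
the x86_64 number from this patchset only and is not allocated upstream;
the chunk size is arbitrary):

/* prefetch a directory in chunks with the proposed dirreadahead syscall */
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#define __NR_dirreadahead 317   /* from this RFC's syscall_64.tbl; not upstream */

static long dirreadahead(int fd, loff_t *offset, unsigned int count)
{
        return syscall(__NR_dirreadahead, fd, offset, count);
}

static void prefetch_in_chunks(int dirfd)
{
        loff_t off = 0;
        long n;

        /* readahead 1000 entries at a time; process each batch before the next */
        while ((n = dirreadahead(dirfd, &off, 1000)) > 0) {
                /* getdents()/stat() the ~n entries just prefetched here */
        }
}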

That said, this interface is not set in stone and we are exploring ways to inform
the kernel of the inodes we are interested in reading ahead.
Maybe you imagine using 'getdents' first to get a list of names, then
selectively calling 'dirreadahead' on the offsets of the names you are
interested in? That would be racy as names can be added and removed which
might change offsets. So maybe you have another reason?
I would like to suggest an alternate interface (I love playing the API
game....).
1/ Add a flag to 'fstatat' AT_EXPECT_MORE.
If the pathname does not contain a '/', then the 'dirfd' is marked
to indicate that stat information for all names returned by getdents will
be wanted. The filesystem can choose to optimise that however it sees
fit.
2/ Add a flag to 'fstatat' AT_NONBLOCK.
This tells the filesystem that you want this information, so if it can
return it immediately it should, and if not it should start pulling it
into cache. Possibly this should be two flags: AT_NONBLOCK just avoids
any IO, and AT_ASYNC instigates IO even if NONBLOCK is set.
Then an "ls -l" could use AT_EXPECT_MORE and then just stat each name.
An "ls -l *.c", might avoid AT_EXPECT_MORE, but would use AT_NONBLOCK
against all names, then try again with all the names that returned
EWOULDBLOCK the first time.
I would really like to see the 'xstat' syscall too, but there is no point
having both "xstat" and "fxstat". Follow the model of "fstatat" and provide
just "fxstatat" which can do both.
With fxstatat, AT_EXPECT_MORE would tell the dirfd exactly which attributes
would be wanted so it can fetch only that which is desired.
I'm not very keen on the xgetdents idea of including name information and
stat information into the one syscall - I would prefer getdents and xstat be
kept separate. Of course if a genuine performance cost of the separation can
be demonstrated, I could well change my mind.
It does, however, have the advantage that the kernel doesn't need to worry
about how long read-ahead data needs to be kept, and the application doesn't
need to worry about how soon to retry an fstatat which failed with
EWOULDBLOCK.
Thanks for raising this issue again. I hope it gets fixed one day...
NeilBrown