Discussion:
[PATCH 2/4] vfs: Define new syscalls preadv2,pwritev2
Milosz Tanski
2014-10-21 20:46:57 UTC
Permalink
New syscalls that take a flags argument. This change does not add any specific
flags.

Signed-off-by: Milosz Tanski <***@adfin.com>
---
fs/read_write.c | 82 ++++++++++++++++++++++++++++++++-------
include/linux/syscalls.h | 6 +++
include/uapi/asm-generic/unistd.h | 6 ++-
mm/filemap.c | 2 +-
4 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 9858c06..e3d8451 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -866,6 +866,8 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
+ if (flags & ~0)
+ return -EINVAL;

return do_readv_writev(READ, file, vec, vlen, pos, flags);
}
@@ -879,21 +881,23 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
+ if (flags & ~0)
+ return -EINVAL;

return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
}

EXPORT_SYMBOL(vfs_writev);

-SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen)
+static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_readv(f.file, vec, vlen, &pos, 0);
+ ret = vfs_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -905,15 +909,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}

-SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen)
+static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_writev(f.file, vec, vlen, &pos, 0);
+ ret = vfs_writev(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -931,10 +935,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

-SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos, int flags)
{
- loff_t pos = pos_from_hilo(pos_h, pos_l);
struct fd f;
ssize_t ret = -EBADF;

@@ -945,7 +948,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos, 0);
+ ret = vfs_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
}

@@ -955,10 +958,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}

-SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos, int flags)
{
- loff_t pos = pos_from_hilo(pos_h, pos_l);
struct fd f;
ssize_t ret = -EBADF;

@@ -969,7 +971,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos, 0);
+ ret = vfs_writev(f.file, vec, vlen, &pos, flags);
fdput(f);
}

@@ -979,6 +981,58 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}

+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen)
+{
+ return do_readv(fd, vec, vlen, 0);
+}
+
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen)
+{
+ return do_writev(fd, vec, vlen, 0);
+}
+
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ return do_preadv(fd, vec, vlen, pos, 0);
+}
+
+SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+ int, flags)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ if (pos == -1)
+ return do_readv(fd, vec, vlen, flags);
+
+ return do_preadv(fd, vec, vlen, pos, flags);
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ return do_pwritev(fd, vec, vlen, pos, 0);
+}
+
+SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+ int, flags)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ if (pos == -1)
+ return do_writev(fd, vec, vlen, flags);
+
+ return do_pwritev(fd, vec, vlen, pos, flags);
+}
+
#ifdef CONFIG_COMPAT

static ssize_t compat_do_readv_writev(int type, struct file *file,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bda9b81..cedc22e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -571,8 +571,14 @@ asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos);
asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
+ int flags);
asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
+ int flags);
asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
asmlinkage long sys_chdir(const char __user *filename);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 22749c1..10f8883 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -213,6 +213,10 @@ __SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64)
__SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv)
#define __NR_pwritev 70
__SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev)
+#define __NR_preadv2 281
+__SC_COMP(__NR_preadv2, sys_preadv2)
+#define __NR_pwritev2 282
+__SC_COMP(__NR_pwritev2, sys_pwritev2)

/* fs/sendfile.c */
#define __NR3264_sendfile 71
@@ -709,7 +713,7 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)

#undef __NR_syscalls
-#define __NR_syscalls 281
+#define __NR_syscalls 283

/*
* All syscalls below here should go away really,
diff --git a/mm/filemap.c b/mm/filemap.c
index cb7f530..45964c8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1735,7 +1735,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
}

- retval = do_generic_file_read(file, ppos, iter, retval);
+ retval = do_generic_file_read(file, ppos, iter, retval, iocb->ki_rwflags);
out:
return retval;
}
--
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to ***@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"***@kvack.org">***@kvack.org</a>
Milosz Tanski
2014-10-21 20:46:58 UTC
Permalink
This is only for x86_64 and x86. Other architectures will be added later.

Signed-off-by: Milosz Tanski <***@adfin.com>
---
arch/x86/syscalls/syscall_32.tbl | 2 ++
arch/x86/syscalls/syscall_64.tbl | 2 ++
2 files changed, 4 insertions(+)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 9fe1b5d..d592d87 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -364,3 +364,5 @@
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
+358 i386 preadv2 sys_preadv2
+359 i386 pwritev2 sys_pwritev2
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 281150b..7be2447 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -328,6 +328,8 @@
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
+322 64 preadv2 sys_preadv2
+323 64 pwritev2 sys_pwritev2

#
# x32-specific system call numbers start at 512 to avoid cache impact
--
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to ***@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"***@kvack.org">***@kvack.org</a>
Milosz Tanski
2014-10-21 20:46:59 UTC
Permalink
Filesystems that do not use generic_file_read_iter will not be allowed to perform
non-blocking reads. A non-blocking read only returns data that is already in the
page cache and has no page error (which would require a re-read).

Christoph Hellwig wrote the filesystem-specific code (cifs, ocfs2, shmem, xfs).

Signed-off-by: Milosz Tanski <***@adfin.com>
---
fs/cifs/file.c | 6 ++++++
fs/ocfs2/file.c | 6 ++++++
fs/pipe.c | 3 ++-
fs/read_write.c | 21 ++++++++++++++-------
fs/xfs/xfs_file.c | 4 ++++
include/linux/fs.h | 3 +++
mm/filemap.c | 18 ++++++++++++++++++
mm/shmem.c | 4 ++++
8 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3e4d00a..c485afa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3005,6 +3005,9 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
struct cifs_readdata *rdata, *tmp;
struct list_head rdata_list;

+ if (iocb->ki_rwflags & RWF_NONBLOCK)
+ return -EAGAIN;
+
len = iov_iter_count(to);
if (!len)
return 0;
@@ -3123,6 +3126,9 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
return generic_file_read_iter(iocb, to);

+ if (iocb->ki_rwflags & RWF_NONBLOCK)
+ return -EAGAIN;
+
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents reading.
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 324dc93..bb66ca4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2472,6 +2472,12 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
filp->f_path.dentry->d_name.name,
to->nr_segs); /* GRRRRR */

+ /*
+ * No non-blocking reads for ocfs2 for now. Might be doable with
+ * non-blocking cluster lock helpers.
+ */
+ if (iocb->ki_rwflags & RWF_NONBLOCK)
+ return -EAGAIN;

if (!inode) {
ret = -EINVAL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e5..212bf68 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -302,7 +302,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
*/
if (ret)
break;
- if (filp->f_flags & O_NONBLOCK) {
+ if ((filp->f_flags & O_NONBLOCK) ||
+ (iocb->ki_rwflags & RWF_NONBLOCK)) {
ret = -EAGAIN;
break;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index e3d8451..955d829 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -835,14 +835,19 @@ static ssize_t do_readv_writev(int type, struct file *file,
file_start_write(file);
}

- if (iter_fn)
+ if (iter_fn) {
ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
pos, iter_fn, flags);
- else if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
- else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ } else {
+ if (type == READ && (flags & RWF_NONBLOCK))
+ return -EAGAIN;
+
+ if (fnv)
+ ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+ pos, fnv);
+ else
+ ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ }

if (type != READ)
file_end_write(file);
@@ -866,8 +871,10 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
- if (flags & ~0)
+ if (flags & ~RWF_NONBLOCK)
return -EINVAL;
+ if ((file->f_flags & O_DIRECT) && (flags & RWF_NONBLOCK))
+ return -EAGAIN;

return do_readv_writev(READ, file, vec, vlen, pos, flags);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index eb596b4..b1f6334 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -246,6 +246,10 @@ xfs_file_read_iter(

XFS_STATS_INC(xs_read_calls);

+ /* XXX: need a non-blocking iolock helper, shouldn't be too hard */
+ if (iocb->ki_rwflags & RWF_NONBLOCK)
+ return -EAGAIN;
+
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ed5711..eaebd99 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1459,6 +1459,9 @@ struct block_device_operations;
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1

+/* These flags are used for the readv/writev syscalls with flags. */
+#define RWF_NONBLOCK 0x00000001
+
struct iov_iter;

struct file_operations {
diff --git a/mm/filemap.c b/mm/filemap.c
index 45964c8..e73ba7e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1493,6 +1493,8 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
find_page:
page = find_get_page(mapping, index);
if (!page) {
+ if (flags & RWF_NONBLOCK)
+ goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
@@ -1584,6 +1586,11 @@ page_ok:
continue;

page_not_up_to_date:
+ if (flags & RWF_NONBLOCK) {
+ page_cache_release(page);
+ goto would_block;
+ }
+
/* Get exclusive access to the page ... */
error = lock_page_killable(page);
if (unlikely(error))
@@ -1603,6 +1610,12 @@ page_not_up_to_date_locked:
goto page_ok;
}

+ if (flags & RWF_NONBLOCK) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto would_block;
+ }
+
readpage:
/*
* A previous I/O error may have been due to temporary
@@ -1673,6 +1686,8 @@ no_cached_page:
goto readpage;
}

+would_block:
+ error = -EAGAIN;
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_CACHE_SHIFT;
@@ -1706,6 +1721,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
size_t count = iov_iter_count(iter);
loff_t size;

+ if (iocb->ki_rwflags & RWF_NONBLOCK)
+ return -EAGAIN;
+
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
diff --git a/mm/shmem.c b/mm/shmem.c
index cd6fc75..5c30f04 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1531,6 +1531,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;

+ /* XXX: should be easily supportable */
+ if (iocb->ki_rwflags & RWF_NONBLOCK)
+ return -EAGAIN;
+
/*
* Might this read be for a stacking filesystem? Then when reading
* holes of a sparse file, we actually need to allocate those pages,
--
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to ***@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"***@kvack.org">***@kvack.org</a>
Milosz Tanski
2014-10-21 20:46:56 UTC
Permalink
Plumb the flags argument through the VFS code so it can be passed down to the
__generic_file_(read/write)_iter functions that do the actual work.

Signed-off-by: Milosz Tanski <***@adfin.com>
---
drivers/target/target_core_file.c | 6 +++---
fs/nfsd/vfs.c | 4 ++--
fs/read_write.c | 28 ++++++++++++++++------------
fs/splice.c | 2 +-
include/linux/aio.h | 2 ++
include/linux/fs.h | 4 ++--
mm/filemap.c | 2 +-
7 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 7d6cdda..58d9a6d 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -350,9 +350,9 @@ static int fd_do_rw(struct se_cmd *cmd, struct scatterlist *sgl,
set_fs(get_ds());

if (is_write)
- ret = vfs_writev(fd, &iov[0], sgl_nents, &pos);
+ ret = vfs_writev(fd, &iov[0], sgl_nents, &pos, 0);
else
- ret = vfs_readv(fd, &iov[0], sgl_nents, &pos);
+ ret = vfs_readv(fd, &iov[0], sgl_nents, &pos, 0);

set_fs(old_fs);

@@ -528,7 +528,7 @@ fd_execute_write_same(struct se_cmd *cmd)

old_fs = get_fs();
set_fs(get_ds());
- rc = vfs_writev(f, &iov[0], iov_num, &pos);
+ rc = vfs_writev(f, &iov[0], iov_num, &pos, 0);
set_fs(old_fs);

vfree(iov);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 989129e..ef01c78 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -872,7 +872,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,

oldfs = get_fs();
set_fs(KERNEL_DS);
- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+ host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
set_fs(oldfs);
return nfsd_finish_read(file, count, host_err);
}
@@ -960,7 +960,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,

/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
+ host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
set_fs(oldfs);
if (host_err < 0)
goto out_nfserr;
diff --git a/fs/read_write.c b/fs/read_write.c
index 7d9318c..9858c06 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -653,7 +653,8 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
EXPORT_SYMBOL(iov_shorten);

static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
- unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
+ unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn,
+ int flags)
{
struct kiocb kiocb;
struct iov_iter iter;
@@ -662,6 +663,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_nbytes = len;
+ kiocb.ki_rwflags = flags;

iov_iter_init(&iter, rw, iov, nr_segs, len);
ret = fn(&kiocb, &iter);
@@ -800,7 +802,8 @@ out:

static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
- unsigned long nr_segs, loff_t *pos)
+ unsigned long nr_segs, loff_t *pos,
+ int flags)
{
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
@@ -834,7 +837,7 @@ static ssize_t do_readv_writev(int type, struct file *file,

if (iter_fn)
ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
- pos, iter_fn);
+ pos, iter_fn, flags);
else if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
@@ -857,27 +860,27 @@ out:
}

ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;

- return do_readv_writev(READ, file, vec, vlen, pos);
+ return do_readv_writev(READ, file, vec, vlen, pos, flags);
}

EXPORT_SYMBOL(vfs_readv);

ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;

- return do_readv_writev(WRITE, file, vec, vlen, pos);
+ return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
}

EXPORT_SYMBOL(vfs_writev);
@@ -890,7 +893,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,

if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_readv(f.file, vec, vlen, &pos);
+ ret = vfs_readv(f.file, vec, vlen, &pos, 0);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -910,7 +913,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,

if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_writev(f.file, vec, vlen, &pos);
+ ret = vfs_writev(f.file, vec, vlen, &pos, 0);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -942,7 +945,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos);
+ ret = vfs_readv(f.file, vec, vlen, &pos, 0);
fdput(f);
}

@@ -966,7 +969,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos);
+ ret = vfs_writev(f.file, vec, vlen, &pos, 0);
fdput(f);
}

@@ -1014,7 +1017,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,

if (iter_fn)
ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
- pos, iter_fn);
+ pos, iter_fn, 0);
else if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
@@ -1113,6 +1116,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
return __compat_sys_preadv64(fd, vec, vlen, pos);
}

+
static size_t compat_writev(struct file *file,
const struct compat_iovec __user *vec,
unsigned long vlen, loff_t *pos)
diff --git a/fs/splice.c b/fs/splice.c
index f5cb9ba..9591b9f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -576,7 +576,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);

return res;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index d9c92da..9c1d499 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -52,6 +52,8 @@ struct kiocb {
* this is the underlying eventfd context to deliver events to.
*/
struct eventfd_ctx *ki_eventfd;
+
+ int ki_rwflags;
};

static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a957d43..9ed5711 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1538,9 +1538,9 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- unsigned long, loff_t *);
+ unsigned long, loff_t *, int);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
- unsigned long, loff_t *);
+ unsigned long, loff_t *, int);

struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642..cb7f530 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1465,7 +1465,7 @@ static void shrink_readahead_size_eio(struct file *filp,
* of the logic when it comes to error handling etc.
*/
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
- struct iov_iter *iter, ssize_t written)
+ struct iov_iter *iter, ssize_t written, int flags)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
--
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to ***@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"***@kvack.org">***@kvack.org</a>
Milosz Tanski
2014-10-21 20:59:02 UTC
Permalink
New syscalls that are variations on preadv/pwritev but support an extra
flags argument.

Signed-off-by: Milosz Tanski <***@adfin.com>
Suggested-by: Jeff Moyer <***@redhat.com>
Fixes: Jeff Moyer <***@redhat.com>
---
man2/readv.2 | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/man2/readv.2 b/man2/readv.2
index 8748efa..31b3870 100644
--- a/man2/readv.2
+++ b/man2/readv.2
@@ -45,6 +45,12 @@ readv, writev, preadv, pwritev \- read or write data into multiple buffers
.sp
.BI "ssize_t pwritev(int " fd ", const struct iovec *" iov ", int " iovcnt ,
.BI " off_t " offset );
+.sp
+.BI "ssize_t preadv2(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI " off_t " offset ", int " flags );
+.sp
+.BI "ssize_t pwritev2(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI " off_t " offset ", int " flags );
.fi
.sp
.in -4n
@@ -162,9 +168,9 @@ The
system call combines the functionality of
.BR writev ()
and
-.BR pwrite (2).
+.BR pwrite (2) "."
It performs the same task as
-.BR writev (),
+.BR writev () ","
but adds a fourth argument,
.IR offset ,
which specifies the file offset at which the output operation
@@ -174,15 +180,41 @@ The file offset is not changed by these system calls.
The file referred to by
.I fd
must be capable of seeking.
+.SS preadv2() and pwritev2()
+
+This pair of system calls has similar functionality to the
+.BR preadv ()
+and
+.BR pwritev ()
+calls, but adds a fifth argument, \fIflags\fP, which modifies the behavior on a per call basis.
+
+Like the
+.BR preadv ()
+and
+.BR pwritev ()
+calls, they accept an \fIoffset\fP argument. Unlike those calls, if the \fIoffset\fP argument is set to -1 then the current file offset is used and updated.
+
+The \fIflags\fP arguments to
+.BR preadv2 ()
+and
+.BR pwritev2 ()
+contains a bitwise OR of one or more of the following flags:
+.TP
+.BR RWF_NONBLOCK " (only " preadv2() " since Linux 3.19)"
+Performs a non-blocking operation for regular files (not sockets) opened in buffered mode (not
+.BR O_DIRECT ")."
+
.SH RETURN VALUE
On success,
-.BR readv ()
-and
+.BR readv () ","
.BR preadv ()
-return the number of bytes read;
-.BR writev ()
and
+.BR preadv2 ()
+return the number of bytes read;
+.BR writev () ","
.BR pwritev ()
+and
+.BR pwritev2 ()
return the number of bytes written.
On error, \-1 is returned, and \fIerrno\fP is set appropriately.
.SH ERRORS
@@ -191,12 +223,22 @@ The errors are as given for
and
.BR write (2).
Furthermore,
-.BR preadv ()
-and
+.BR preadv () ","
+.BR preadv2 () ","
.BR pwritev ()
+and
+.BR pwritev2 ()
can also fail for the same reasons as
.BR lseek (2).
-Additionally, the following error is defined:
+Additionally, the following errors are defined:
+.TP
+.B EAGAIN
+The operation would block. This is possible if the file descriptor \fIfd\fP refers to a socket and has been marked nonblocking
+.RB ( O_NONBLOCK ),
+or the operation is a
+.BR preadv2
+and the \fIflags\fP argument is set to
+.BR RWF_NONBLOCK.
.TP
.B EINVAL
The sum of the
@@ -205,12 +247,17 @@ values overflows an
.I ssize_t
value.
Or, the vector count \fIiovcnt\fP is less than zero or greater than the
-permitted maximum.
+permitted maximum. Or, an unknown flag is specified in \fIflags\fP.
.SH VERSIONS
.BR preadv ()
and
.BR pwritev ()
first appeared in Linux 2.6.30; library support was added in glibc 2.10.
+.sp
+.BR preadv2 ()
+and
+.BR pwritev2 ()
+first appeared in Linux 3.19 (if we're lucky);
.SH CONFORMING TO
.BR readv (),
.BR writev ():
@@ -223,6 +270,10 @@ first appeared in Linux 2.6.30; library support was added in glibc 2.10.
.BR preadv (),
.BR pwritev ():
nonstandard, but present also on the modern BSDs.
+.sp
+.BR preadv2 (),
+.BR pwritev2 ():
+nonstandard, Linux extension.
.SH NOTES
.SS C library/kernel ABI differences
POSIX.1-2001 allows an implementation to place a limit on
--
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to ***@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"***@kvack.org">***@kvack.org</a>
Loading...