[PATCH v4 0/5] fuse: Add support for mounts from pid/user namespaces

Allowing unprivileged users to provide arbitrary xattrs via fuse
mounts bypasses the normal restrictions on setting xattrs. Such
mounts should be restricted to reading and writing xattrs in the
user.* namespace.

It's difficult though to tell whether a mount is being performed
on behalf of an unprivileged user since fuse mounts are ususally
done via a suid root helper. Thus a new mount option,
privileged_xattrs, is added to indicated that xattrs from other
namespaces are allowed. This option can only be supplied by
system-wide root; supplying the option as an unprivileged user
will cause the mount to fail.

Cc: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/***@public.gmane.org>
Cc: Serge H. Hallyn <serge.hallyn-GeWIH/nMZzLQT0dZR+***@public.gmane.org>
Signed-off-by: Seth Forshee <seth.forshee-Z7WLFzj8eWMS+***@public.gmane.org>
---
fs/fuse/dir.c | 9 +++++++++
fs/fuse/fuse_i.h | 5 +++++
fs/fuse/inode.c | 37 ++++++++++++++++++++++++-------------
3 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e3123bfbc711..1bb378aa9175 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
+#include <linux/xattr.h>

static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
@@ -1882,6 +1883,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
if (fc->no_setxattr)
return -EOPNOTSUPP;

+ if (!(fc->flags & FUSE_PRIVILEGED_XATTRS) &&
+ strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0)
+ return -EOPNOTSUPP;
+
req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1925,6 +1930,10 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
if (fc->no_getxattr)
return -EOPNOTSUPP;

+ if (!(fc->flags & FUSE_PRIVILEGED_XATTRS) &&
+ strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0)
+ return -EOPNOTSUPP;
+
req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 81187ba04e4a..3ea4b4db9a79 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -46,6 +46,11 @@
doing the mount will be allowed to access the filesystem */
#define FUSE_ALLOW_OTHER (1 << 1)

+/** If the FUSE_PRIV_XATTRS flag is given, then xattrs outside the
+ user.* namespace are allowed. This option is only allowed for
+ system root. */
+#define FUSE_PRIVILEGED_XATTRS (1 << 2)
+
/** Number of page pointers embedded in fuse_req */
#define FUSE_REQ_INLINE_PAGES 1

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b88b5a780228..5e00a6a76049 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -493,6 +493,7 @@ enum {
OPT_ALLOW_OTHER,
OPT_MAX_READ,
OPT_BLKSIZE,
+ OPT_PRIVILEGED_XATTRS,
OPT_ERR
};

@@ -505,6 +506,7 @@ static const match_table_t tokens = {
{OPT_ALLOW_OTHER, "allow_other"},
{OPT_MAX_READ, "max_read=%u"},
{OPT_BLKSIZE, "blksize=%u"},
+ {OPT_PRIVILEGED_XATTRS, "privileged_xattrs"},
{OPT_ERR, NULL}
};

@@ -540,35 +542,35 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
switch (token) {
case OPT_FD:
if (match_int(&args[0], &value))
- return 0;
+ return -EINVAL;
d->fd = value;
d->fd_present = 1;
break;

case OPT_ROOTMODE:
if (match_octal(&args[0], &value))
- return 0;
+ return -EINVAL;
if (!fuse_valid_type(value))
- return 0;
+ return -EINVAL;
d->rootmode = value;
d->rootmode_present = 1;
break;

case OPT_USER_ID:
if (fuse_match_uint(&args[0], &uv))
- return 0;
+ return -EINVAL;
d->user_id = make_kuid(current_user_ns(), uv);
if (!uid_valid(d->user_id))
- return 0;
+ return -EINVAL;
d->user_id_present = 1;
break;

case OPT_GROUP_ID:
if (fuse_match_uint(&args[0], &uv))
- return 0;
+ return -EINVAL;
d->group_id = make_kgid(current_user_ns(), uv);
if (!gid_valid(d->group_id))
- return 0;
+ return -EINVAL;
d->group_id_present = 1;
break;

@@ -582,26 +584,32 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)

case OPT_MAX_READ:
if (match_int(&args[0], &value))
- return 0;
+ return -EINVAL;
d->max_read = value;
break;

case OPT_BLKSIZE:
if (!is_bdev || match_int(&args[0], &value))
- return 0;
+ return -EINVAL;
d->blksize = value;
break;

+ case OPT_PRIVILEGED_XATTRS:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ d->flags |= FUSE_PRIVILEGED_XATTRS;
+ break;
+
default:
- return 0;
+ return -EINVAL;
}
}

if (!d->fd_present || !d->rootmode_present ||
!d->user_id_present || !d->group_id_present)
- return 0;
+ return -EINVAL;

- return 1;
+ return 0;
}

static int fuse_show_options(struct seq_file *m, struct dentry *root)
@@ -617,6 +625,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",default_permissions");
if (fc->flags & FUSE_ALLOW_OTHER)
seq_puts(m, ",allow_other");
+ if (fc->flags & FUSE_PRIVILEGED_XATTRS)
+ seq_puts(m, ",privileged_xattrs");
if (fc->max_read != ~0)
seq_printf(m, ",max_read=%u", fc->max_read);
if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
@@ -1058,7 +1068,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)

sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);

- if (!parse_fuse_opt(data, &d, is_bdev))
+ err = parse_fuse_opt(data, &d, is_bdev);
+ if (err)
goto err;

if (is_bdev) {

--
2.1.0

Michael j Theall

2014-10-14 18:12:26 UTC

Date: 10/14/2014 09:27 AM
Subject: [fuse-devel] [PATCH v4 4/5] fuse: Support privileged xattrs
only with a mount option
Allowing unprivileged users to provide arbitrary xattrs via fuse
mounts bypasses the normal restrictions on setting xattrs. Such
mounts should be restricted to reading and writing xattrs in the
user.* namespace.

Can you explain how the normal restrictions on setting xattrs are
bypassed?
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

It's difficult though to tell whether a mount is being performed
on behalf of an unprivileged user since fuse mounts are ususally
done via a suid root helper. Thus a new mount option,
privileged_xattrs, is added to indicated that xattrs from other
namespaces are allowed. This option can only be supplied by
system-wide root; supplying the option as an unprivileged user
will cause the mount to fail.

I can't say I'm convinced that this is the right direction to head.

Regards,
Michael Theall

Eric W. Biederman

2014-10-14 20:01:02 UTC

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Eric

Seth Forshee

2014-10-14 20:59:55 UTC

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

Thanks,
Seth

Eric W. Biederman

2014-10-14 21:13:08 UTC

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.

Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.

Sigh.

This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.

Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

Eric

Andy Lutomirski

2014-10-14 21:19:19 UTC

On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

It's this code in selinux/hooks.c:

if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) ||
(bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS))
new_tsec->sid = old_tsec->sid;

One might argue that this should actually generate -EPERM instead of
ignoring the label, but it should be safe against untrusted media.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

Serge had also mentioned adding some kind of hook to help LSMs handle
user namespaces more intelligently.

--Andy

Post by Eric W. Biederman
Eric

--
Andy Lutomirski
AMA Capital Management, LLC

Eric W. Biederman

2014-10-14 21:29:13 UTC

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) ||
(bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS))
new_tsec->sid = old_tsec->sid;
One might argue that this should actually generate -EPERM instead of
ignoring the label, but it should be safe against untrusted media.

Fair enough. Smack does not replicate any form of that check so smack
appears vulnerable to untrusted media.

I don't think we have any other security modules beyond smack and
selinux that use labels.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.
Serge had also mentioned adding some kind of hook to help LSMs handle
user namespaces more intelligently.

Eric

Seth Forshee

2014-10-15 07:39:51 UTC

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

To be honest I don't understand enough about how selinux uses security
labels to know what level of paranoia is appropriate, so I wrote this
out of an excess of paranoia. If the patch you sent restricts things
sufficiently then I'm perfectly happy to see this patch dropped.

And really with fuse all of this is really excess paranoia because (if
my other patches are applied at least) the fuse mount will be
inaccessible to any user outside the user namespace from which it was
mounted or its descendants.

Thanks,
Seth

Andy Lutomirski

2014-10-15 14:37:36 UTC

On Wed, Oct 15, 2014 at 12:39 AM, Seth Forshee

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

To be honest I don't understand enough about how selinux uses security
labels to know what level of paranoia is appropriate, so I wrote this
out of an excess of paranoia. If the patch you sent restricts things
sufficiently then I'm perfectly happy to see this patch dropped.
And really with fuse all of this is really excess paranoia because (if
my other patches are applied at least) the fuse mount will be
inaccessible to any user outside the user namespace from which it was
mounted or its descendants.

I missed the rest of the series. This is exciting!

I'm not sure that the other protections you have are quite sufficient,
though, without something like my patch. I'll comment on the rest.

--Andy

Seth Forshee

2014-10-21 21:21:51 UTC

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 12:39 AM, Seth Forshee

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

To be honest I don't understand enough about how selinux uses security
labels to know what level of paranoia is appropriate, so I wrote this
out of an excess of paranoia. If the patch you sent restricts things
sufficiently then I'm perfectly happy to see this patch dropped.
And really with fuse all of this is really excess paranoia because (if
my other patches are applied at least) the fuse mount will be
inaccessible to any user outside the user namespace from which it was
mounted or its descendants.

I missed the rest of the series. This is exciting!
I'm not sure that the other protections you have are quite sufficient,
though, without something like my patch. I'll comment on the rest.

I still suspect we should be doing more to limit xattrs from userns
mounts, since normally only root is allowed to set trusted.* and
security.* xattrs. Seems like this should be done more generally though
and not just specific to fuse. Something like this maybe? It probably
won't matter much for fuse mounts since they won't be accessible outside
the userns which did the mount, but for other filesystems the xattrs
could be set externally and injected into the system via a userns mount.

diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..499cd7d2d2f8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -149,6 +149,7 @@ static void destroy_super(struct super_block *s)
percpu_counter_destroy(&s->s_writers.counter[i]);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
+ put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
kfree(s->s_options);
kfree_rcu(s, rcu);
@@ -230,6 +231,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+
+ s->s_user_ns = get_user_ns(&init_user_ns);
return s;

fail:
diff --git a/fs/xattr.c b/fs/xattr.c
index 64e83efb742d..383bb9f25555 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -40,6 +40,12 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return -EPERM;
}

+ /* Restrict security.* and trusted.* to mounts from init_user_ns. */
+ if (inode->i_sb->s_user_ns != &init_user_ns &&
+ (!strcmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+ !strcmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)))
+ return -EPERM;
+
/*
* No restriction for security.* and system.* from the VFS. Decision
* on these is left to the underlying filesystem / security module.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a957d4366c24..786c5e9c845f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1259,6 +1259,8 @@ struct super_block {
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;

+ struct user_namespace *s_user_ns;
+
/*
* Keep the lru lists last in the structure so they always sit on their
* own individual cachelines.

Andy Lutomirski

2014-10-21 21:27:13 UTC

On Tue, Oct 21, 2014 at 2:21 PM, Seth Forshee

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 12:39 AM, Seth Forshee

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

To be honest I don't understand enough about how selinux uses security
labels to know what level of paranoia is appropriate, so I wrote this
out of an excess of paranoia. If the patch you sent restricts things
sufficiently then I'm perfectly happy to see this patch dropped.
And really with fuse all of this is really excess paranoia because (if
my other patches are applied at least) the fuse mount will be
inaccessible to any user outside the user namespace from which it was
mounted or its descendants.

I missed the rest of the series. This is exciting!
I'm not sure that the other protections you have are quite sufficient,
though, without something like my patch. I'll comment on the rest.

Huh? I think I like this in principle, but shouldn't this be the
actual userns doing the mount?

Post by Seth Forshee
return s;
diff --git a/fs/xattr.c b/fs/xattr.c
index 64e83efb742d..383bb9f25555 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -40,6 +40,12 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return -EPERM;
}
+ /* Restrict security.* and trusted.* to mounts from init_user_ns. */
+ if (inode->i_sb->s_user_ns != &init_user_ns &&
+ (!strcmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+ !strcmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)))
+ return -EPERM;
+

trusted.* should be fine already, I think -- it checks global
capabilities. And I still think that security.* should be left to
LSMs, which IMO really do need to be fixed for user namespaces.

But how does this help with FUSE at all? Does FUSE end up calling
xattr_permission?

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Michael j Theall

2014-10-21 21:34:05 UTC

Date: 10/21/2014 04:27 PM
Subject: Re: [fuse-devel] [PATCH v4 4/5] fuse: Support privileged
xattrs only with a mount option
On Tue, Oct 21, 2014 at 2:21 PM, Seth Forshee

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 12:39 AM, Seth Forshee

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Date: 10/14/2014 09:27 AM
Subject: [fuse-devel] [PATCH v4 4/5] fuse: Support

privileged xattrs

only with a mount option
Allowing unprivileged users to provide arbitrary xattrs via fuse
mounts bypasses the normal restrictions on setting xattrs. Such
mounts should be restricted to reading and writing xattrs in the
user.* namespace.

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs

which were not

Post by Seth Forshee
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and

it looks like

Post by Michael j Theall
xattr_permission already prevents non-privileged users

from accessing

Post by Michael j Theall
trusted.*

If the filesystem is mounted with nosuid (typical of a

non-privileged

Post by Eric W. Biederman
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

feel far from

Post by Eric W. Biederman
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not

sufficient than

Post by Eric W. Biederman
it may be possible for me to insert a usb stick with an

extNfilesystem

Post by Eric W. Biederman
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right

direction to head.

Post by Eric W. Biederman
With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid

root helper,

Post by Seth Forshee
which is why I've required both global CAP_SYS_ADMIN and a

mount option

Post by Seth Forshee
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I

would highlight

Post by Eric W. Biederman
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

To be honest I don't understand enough about how selinux uses security
labels to know what level of paranoia is appropriate, so I wrote this
out of an excess of paranoia. If the patch you sent restricts things
sufficiently then I'm perfectly happy to see this patch dropped.
And really with fuse all of this is really excess paranoia because (if
my other patches are applied at least) the fuse mount will be
inaccessible to any user outside the user namespace from which it was
mounted or its descendants.

I missed the rest of the series. This is exciting!
I'm not sure that the other protections you have are quite

sufficient,

Post by Andy Lutomirski
though, without something like my patch. I'll comment on the rest.

file_system_type *type, int flags)

Post by Seth Forshee
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+
+ s->s_user_ns = get_user_ns(&init_user_ns);

Huh? I think I like this in principle, but shouldn't this be the
actual userns doing the mount?

char *name, int mask)

Post by Seth Forshee
return -EPERM;
}
+ /* Restrict security.* and trusted.* to mounts from

init_user_ns. */

Post by Seth Forshee
+ if (inode->i_sb->s_user_ns != &init_user_ns &&
+ (!strcmp(name, XATTR_SECURITY_PREFIX,

XATTR_SECURITY_PREFIX_LEN) ||

Post by Seth Forshee
+ !strcmp(name, XATTR_TRUSTED_PREFIX,

XATTR_TRUSTED_PREFIX_LEN)))

Post by Seth Forshee
+ return -EPERM;
+

The xattr system calls go through xattr_permission before it ever gets to
the FUSE ops.

Regards,
Michael Theall

Andy Lutomirski

2014-10-21 21:44:31 UTC

Post by Andy Lutomirski
But how does this help with FUSE at all? Does FUSE end up calling
xattr_permission?
--Andy

The xattr system calls go through xattr_permission before it ever gets to
the FUSE ops.

But a malicious FUSE filesystem can just put those xattrs there by
fiat, the same way that my old FUSE-based sploit put a setuid root
copy of bash in the filesystem. No setxattr calls are needed.

--Andy

Seth Forshee

2014-10-22 04:58:48 UTC

Post by Andy Lutomirski
On Tue, Oct 21, 2014 at 2:21 PM, Seth Forshee

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 12:39 AM, Seth Forshee

Post by Andy Lutomirski
On Tue, Oct 14, 2014 at 2:13 PM, Eric W. Biederman

Can you explain how the normal restrictions on setting xattrs are
bypassed?

If the fuse server is not run by root. Which is a large part of the
point of fuse.

So the server could for example return trusted.* xattrs which were not
set by a privileged user.

Post by Michael j Theall
My filesystem still needs security.* and system.*, and it looks like
xattr_permission already prevents non-privileged users from accessing
trusted.*

If the filesystem is mounted with nosuid (typical of a non-privileged
mount of fuse) then the security.* attributes are ignored.

That I wasn't aware of. In fact I still haven't found where this
restriction is implemented.

My memory may be have been incomplete. What I was thinking of is
security/commoncap.c the MNT_NOSUID check in get_file_caps.
Upon inspection that appears limited to file capabilities, and while
there are a few other MNT_NOSUID checks under security the feel far from
complete.
Sigh.
This deserves a hard look because if MNT_NOSUID is not sufficient than
it may be possible for me to insert a usb stick with an extN filesystem
with the right labels having it auto-mounted nosuid and subvert the
security of something like selinux.

Post by Seth Forshee
Nonetheless, a userns mount could be done without nosuid (though that
mount will also be unaccessible outside of that namespace).

I can't say I'm convinced that this is the right direction to head.

With respect to defaults we could keep the current default if you
have the global CAP_SYS_ADMIN privilege when the mount takes place
and then avoid breaking anything.

Except that unprivileged mounts are normally done by a suid root helper,
which is why I've required both global CAP_SYS_ADMIN and a mount option
to get the current default behavior.

If nosuid is sufficient that may break existing setups for no good
reason.
Shrug. I won't have much time for a bit but I figured I would highlight
the potential security hole in existing setups. So someone with time
this week can look at that.

I think I have a better solution. I'll send it out.

To be honest I don't understand enough about how selinux uses security
labels to know what level of paranoia is appropriate, so I wrote this
out of an excess of paranoia. If the patch you sent restricts things
sufficiently then I'm perfectly happy to see this patch dropped.
And really with fuse all of this is really excess paranoia because (if
my other patches are applied at least) the fuse mount will be
inaccessible to any user outside the user namespace from which it was
mounted or its descendants.

I missed the rest of the series. This is exciting!
I'm not sure that the other protections you have are quite sufficient,
though, without something like my patch. I'll comment on the rest.

Huh? I think I like this in principle, but shouldn't this be the
actual userns doing the mount?

Probably, or else the fs should change it. The reason I'm not sure yet
is that I also started poking at adding userns support to ext4 the other
day, and for that I'm using s_user_ns to do the translations in
i_[ug]id_(read|write) and I still need to verify that it won't break
anything for other filesystems that support userns mounts. But you're
right; as I've shown it here the changes are ineffective.

It gets called from vfs_getxattr, and thus for the getxattr syscall for
all fs types, so this would block reading any trusted.* xattrs from the
fuse userspace process.

But like I said before, the access restrictions that are in place should
prevent this from really being a problem, so these changes could
probably wait. The one thing it would change is that if we have
s_user_ns in the superblock I'd probably make fuse use that instead of
storing it in fs-internal data, but that can always be changed later.

Thanks,
Seth

Andy Lutomirski

2014-10-23 18:32:41 UTC

Post by Andy Lutomirski
On Tue, Oct 21, 2014 at 2:21 PM, Seth Forshee

It gets called from vfs_getxattr, and thus for the getxattr syscall for
all fs types, so this would block reading any trusted.* xattrs from the
fuse userspace process.

Oh. It seems weird to me that getxattr would get an error instead of
FUSE being prevented from setting those attributes.

I'm still unconvinced that this is the right approach. And anything
that tries to use LSMs in a container will eventually want those
attributes.

--Andy

Seth Forshee

2014-10-14 14:25:53 UTC

Update fuse to translate uids and gids to/from the user namspace
of the process servicing requests on /dev/fuse. Any ids which do
not map into the namespace will result in errors. inodes will
also be marked bad when unmappable ids are received from
userspace.

Due to security concerns the namespace used should be fixed,
otherwise a user might be able to gain elevated privileges or
influence processes that the user would otherwise be unable to
manipulate. Thus the namespace of the mounting process is used
for all translations, and this namespace is required to be the
same as the one in use when /dev/fuse was opened.

Cc: Eric W. Biederman <***@xmission.com>
Cc: Serge H. Hallyn <***@ubuntu.com>
Signed-off-by: Seth Forshee <***@canonical.com>
---
fs/fuse/dev.c | 4 +--
fs/fuse/dir.c | 81 ++++++++++++++++++++++++++++++++++++--------------------
fs/fuse/fuse_i.h | 12 ++++++---
fs/fuse/inode.c | 73 +++++++++++++++++++++++++++++++++++++++-----------
4 files changed, 121 insertions(+), 49 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 839caebd34f1..03c8785ed731 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -127,8 +127,8 @@ static void __fuse_put_request(struct fuse_req *req)

static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
{
- req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
- req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
+ req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index de1d84af9f7c..123db1e06c78 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -253,9 +253,12 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;

- fuse_change_attributes(inode, &outarg.attr,
- entry_attr_timeout(&outarg),
- attr_version);
+ err = fuse_change_attributes(inode, &outarg.attr,
+ entry_attr_timeout(&outarg),
+ attr_version);
+ if (err)
+ goto invalid;
+
fuse_change_entry_timeout(entry, &outarg);
} else if (inode) {
fi = get_fuse_inode(inode);
@@ -340,8 +343,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
&outarg->attr, entry_attr_timeout(outarg),
attr_version);
- err = -ENOMEM;
- if (!*inode) {
+ if (IS_ERR(*inode)) {
+ err = PTR_ERR(*inode);
+ *inode = NULL;
fuse_queue_forget(fc, forget, outarg->nodeid, 1);
goto out;
}
@@ -473,11 +477,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
ff->open_flags = outopen.open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, entry_attr_timeout(&outentry), 0);
- if (!inode) {
+ if (IS_ERR(inode)) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(ff, flags);
fuse_queue_forget(fc, forget, outentry.nodeid, 1);
- err = -ENOMEM;
+ err = PTR_ERR(inode);
goto out_err;
}
kfree(forget);
@@ -588,9 +592,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,

inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr, entry_attr_timeout(&outarg), 0);
- if (!inode) {
+ if (IS_ERR(inode)) {
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
- return -ENOMEM;
+ return PTR_ERR(inode);
}
kfree(forget);

@@ -905,8 +909,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
stat->nlink = attr->nlink;
- stat->uid = make_kuid(&init_user_ns, attr->uid);
- stat->gid = make_kgid(&init_user_ns, attr->gid);
+ stat->uid = inode->i_uid;
+ stat->gid = inode->i_gid;
stat->rdev = inode->i_rdev;
stat->atime.tv_sec = attr->atime;
stat->atime.tv_nsec = attr->atimensec;
@@ -969,10 +973,10 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
make_bad_inode(inode);
err = -EIO;
} else {
- fuse_change_attributes(inode, &outarg.attr,
- attr_timeout(&outarg),
- attr_version);
- if (stat)
+ err = fuse_change_attributes(inode, &outarg.attr,
+ attr_timeout(&outarg),
+ attr_version);
+ if (!err && stat)
fuse_fillattr(inode, &outarg.attr, stat);
}
}
@@ -1302,9 +1306,11 @@ static int fuse_direntplus_link(struct file *file,
fi->nlookup++;
spin_unlock(&fc->lock);

- fuse_change_attributes(inode, &o->attr,
- entry_attr_timeout(o),
- attr_version);
+ err = fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+ if (err)
+ goto out;

/*
* The other branch to 'found' comes via fuse_iget()
@@ -1322,8 +1328,10 @@ static int fuse_direntplus_link(struct file *file,

inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
&o->attr, entry_attr_timeout(o), attr_version);
- if (!inode)
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out;
+ }

alias = d_materialise_unique(dentry, inode);
err = PTR_ERR(alias);
@@ -1556,17 +1564,25 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
return true;
}

-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
- bool trust_local_cmtime)
+static int iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
+ struct fuse_setattr_in *arg, bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;

if (ivalid & ATTR_MODE)
arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
- if (ivalid & ATTR_UID)
- arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
- if (ivalid & ATTR_GID)
- arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
+ if (ivalid & ATTR_UID) {
+ arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
+ if (arg->uid == (uid_t)-1)
+ return -EINVAL;
+ arg->valid |= FATTR_UID;
+ }
+ if (ivalid & ATTR_GID) {
+ arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
+ if (arg->gid == (gid_t)-1)
+ return -EINVAL;
+ arg->valid |= FATTR_GID;
+ }
if (ivalid & ATTR_SIZE)
arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
if (ivalid & ATTR_ATIME) {
@@ -1588,6 +1604,8 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
arg->ctime = iattr->ia_ctime.tv_sec;
arg->ctimensec = iattr->ia_ctime.tv_nsec;
}
+
+ return 0;
}

/*
@@ -1741,7 +1759,9 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,

memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(attr, &inarg, trust_local_cmtime);
+ err = iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
+ if (err)
+ goto error;
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1778,8 +1798,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
/* FIXME: clear I_DIRTY_SYNC? */
}

- fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg));
+ err = fuse_change_attributes_common(inode, &outarg.attr,
+ attr_timeout(&outarg));
+ if (err) {
+ spin_unlock(&fc->lock);
+ goto error;
+ }
+
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a3ded071e2c6..81187ba04e4a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
#include <linux/rbtree.h>
#include <linux/poll.h>
#include <linux/workqueue.h>
+#include <linux/user_namespace.h>
#include <linux/pid_namespace.h>

/** Max number of pages that can be used in a single read request */
@@ -387,6 +388,9 @@ struct fuse_conn {
/** The group id for this mount */
kgid_t group_id;

+ /** The user namespace for this mount */
+ struct user_namespace *user_ns;
+
/** The pid namespace for this mount */
struct pid_namespace *pid_ns;

@@ -713,11 +717,11 @@ void fuse_init_symlink(struct inode *inode);
/**
* Change attributes of an inode
*/
-void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid, u64 attr_version);
+int fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version);

-void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid);
+int fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid);

/**
* Initialize the client device
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e137969815a3..b88b5a780228 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -155,11 +155,22 @@ static ino_t fuse_squash_ino(u64 ino64)
return ino;
}

-void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid)
+int fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ kuid_t uid;
+ kgid_t gid;
+
+ uid = make_kuid(fc->user_ns, attr->uid);
+ gid = make_kgid(fc->user_ns, attr->gid);
+ if (!uid_valid(uid) || !gid_valid(gid)) {
+ make_bad_inode(inode);
+ return -EIO;
+ }
+ inode->i_uid = uid;
+ inode->i_gid = gid;

fi->attr_version = ++fc->attr_version;
fi->i_time = attr_valid;
@@ -167,8 +178,6 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
set_nlink(inode, attr->nlink);
- inode->i_uid = make_kuid(&init_user_ns, attr->uid);
- inode->i_gid = make_kgid(&init_user_ns, attr->gid);
inode->i_blocks = attr->blocks;
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
@@ -195,26 +204,32 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_mode &= ~S_ISVTX;

fi->orig_ino = attr->ino;
+ return 0;
}

-void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid, u64 attr_version)
+int fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
bool is_wb = fc->writeback_cache;
loff_t oldsize;
struct timespec old_mtime;
+ int err;

spin_lock(&fc->lock);
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fc->lock);
- return;
+ return 0;
}

old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid);
+ err = fuse_change_attributes_common(inode, attr, attr_valid);
+ if (err) {
+ spin_unlock(&fc->lock);
+ return err;
+ }

oldsize = inode->i_size;
/*
@@ -249,6 +264,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
if (inval)
invalidate_inode_pages2(inode->i_mapping);
}
+
+ return 0;
}

static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
@@ -302,7 +319,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
retry:
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
if (!inode)
- return NULL;
+ return ERR_PTR(-ENOMEM);

if ((inode->i_state & I_NEW)) {
inode->i_flags |= S_NOATIME;
@@ -319,11 +336,23 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
goto retry;
}

+ /*
+ * Must do this before incrementing nlookup, as the caller will
+ * send a forget for the node if this function fails.
+ */
+ if (fuse_change_attributes(inode, attr, attr_valid, attr_version)) {
+ /*
+ * inode_make_bad() already called by
+ * fuse_change_attributes()
+ */
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
+
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
fi->nlookup++;
spin_unlock(&fc->lock);
- fuse_change_attributes(inode, attr, attr_valid, attr_version);

return inode;
}
@@ -496,6 +525,8 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
memset(d, 0, sizeof(struct fuse_mount_data));
d->max_read = ~0;
d->blksize = FUSE_DEFAULT_BLKSIZE;
+ d->user_id = make_kuid(current_user_ns(), 0);
+ d->group_id = make_kgid(current_user_ns(), 0);

while ((p = strsep(&opt, ",")) != NULL) {
int token;
@@ -578,8 +609,10 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);

- seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
seq_puts(m, ",default_permissions");
if (fc->flags & FUSE_ALLOW_OTHER)
@@ -618,6 +651,7 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->user_ns = get_user_ns(current_user_ns());
}
EXPORT_SYMBOL_GPL(fuse_conn_init);

@@ -626,6 +660,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (atomic_dec_and_test(&fc->count)) {
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = NULL;
put_pid_ns(fc->pid_ns);
fc->pid_ns = NULL;
fc->release(fc);
@@ -643,12 +679,15 @@ EXPORT_SYMBOL_GPL(fuse_conn_get);
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
+ struct inode *inode;
+
memset(&attr, 0, sizeof(attr));

attr.mode = mode;
attr.ino = FUSE_ROOT_ID;
attr.nlink = 1;
- return fuse_iget(sb, 1, 0, &attr, 0, 0);
+ inode = fuse_iget(sb, 1, 0, &attr, 0, 0);
+ return IS_ERR(inode) ? NULL : inode;
}

struct fuse_inode_handle {
@@ -1043,8 +1082,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (!file)
goto err;

- if ((file->f_op != &fuse_dev_operations) ||
- (file->f_cred->user_ns != &init_user_ns))
+ /*
+ * Require mount to happen from the same user namespace which
+ * opened /dev/fuse to prevent potential attacks.
+ */
+ if (file->f_op != &fuse_dev_operations ||
+ file->f_cred->user_ns != current_user_ns())
goto err_fput;

fc = kmalloc(sizeof(*fc), GFP_KERNEL);

--
2.1.0

Andy Lutomirski

2014-10-15 14:49:39 UTC

Post by Seth Forshee
Update fuse to translate uids and gids to/from the user namspace
of the process servicing requests on /dev/fuse. Any ids which do
not map into the namespace will result in errors. inodes will
also be marked bad when unmappable ids are received from
userspace.
Due to security concerns the namespace used should be fixed,
otherwise a user might be able to gain elevated privileges or
influence processes that the user would otherwise be unable to
manipulate. Thus the namespace of the mounting process is used
for all translations, and this namespace is required to be the
same as the one in use when /dev/fuse was opened.

I'm not sure that this is necessary if my nosuid patch goes in, but I
also don't think it makes any sense to hold this up while we find a
perfect solution.

Is there a decent way to extend this to different translation schemes in
the future (e.g. a flag at fs setup that could be used)?

--Andy

Post by Seth Forshee
---
fs/fuse/dev.c | 4 +--
fs/fuse/dir.c | 81 ++++++++++++++++++++++++++++++++++++--------------------
fs/fuse/fuse_i.h | 12 ++++++---
fs/fuse/inode.c | 73 +++++++++++++++++++++++++++++++++++++++-----------
4 files changed, 121 insertions(+), 49 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 839caebd34f1..03c8785ed731 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -127,8 +127,8 @@ static void __fuse_put_request(struct fuse_req *req)
static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
{
- req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
- req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
+ req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index de1d84af9f7c..123db1e06c78 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -253,9 +253,12 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;
- fuse_change_attributes(inode, &outarg.attr,
- entry_attr_timeout(&outarg),
- attr_version);
+ err = fuse_change_attributes(inode, &outarg.attr,
+ entry_attr_timeout(&outarg),
+ attr_version);
+ if (err)
+ goto invalid;
+
fuse_change_entry_timeout(entry, &outarg);
} else if (inode) {
fi = get_fuse_inode(inode);
@@ -340,8 +343,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
&outarg->attr, entry_attr_timeout(outarg),
attr_version);
- err = -ENOMEM;
- if (!*inode) {
+ if (IS_ERR(*inode)) {
+ err = PTR_ERR(*inode);
+ *inode = NULL;
fuse_queue_forget(fc, forget, outarg->nodeid, 1);
goto out;
}
@@ -473,11 +477,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
ff->open_flags = outopen.open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, entry_attr_timeout(&outentry), 0);
- if (!inode) {
+ if (IS_ERR(inode)) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(ff, flags);
fuse_queue_forget(fc, forget, outentry.nodeid, 1);
- err = -ENOMEM;
+ err = PTR_ERR(inode);
goto out_err;
}
kfree(forget);
@@ -588,9 +592,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr, entry_attr_timeout(&outarg), 0);
- if (!inode) {
+ if (IS_ERR(inode)) {
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
- return -ENOMEM;
+ return PTR_ERR(inode);
}
kfree(forget);
@@ -905,8 +909,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
stat->nlink = attr->nlink;
- stat->uid = make_kuid(&init_user_ns, attr->uid);
- stat->gid = make_kgid(&init_user_ns, attr->gid);
+ stat->uid = inode->i_uid;
+ stat->gid = inode->i_gid;
stat->rdev = inode->i_rdev;
stat->atime.tv_sec = attr->atime;
stat->atime.tv_nsec = attr->atimensec;
@@ -969,10 +973,10 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
make_bad_inode(inode);
err = -EIO;
} else {
- fuse_change_attributes(inode, &outarg.attr,
- attr_timeout(&outarg),
- attr_version);
- if (stat)
+ err = fuse_change_attributes(inode, &outarg.attr,
+ attr_timeout(&outarg),
+ attr_version);
+ if (!err && stat)
fuse_fillattr(inode, &outarg.attr, stat);
}
}
@@ -1302,9 +1306,11 @@ static int fuse_direntplus_link(struct file *file,
fi->nlookup++;
spin_unlock(&fc->lock);
- fuse_change_attributes(inode, &o->attr,
- entry_attr_timeout(o),
- attr_version);
+ err = fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+ if (err)
+ goto out;
/*
* The other branch to 'found' comes via fuse_iget()
@@ -1322,8 +1328,10 @@ static int fuse_direntplus_link(struct file *file,
inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
&o->attr, entry_attr_timeout(o), attr_version);
- if (!inode)
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out;
+ }
alias = d_materialise_unique(dentry, inode);
err = PTR_ERR(alias);
@@ -1556,17 +1564,25 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
return true;
}
-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
- bool trust_local_cmtime)
+static int iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
+ struct fuse_setattr_in *arg, bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;
if (ivalid & ATTR_MODE)
arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
- if (ivalid & ATTR_UID)
- arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
- if (ivalid & ATTR_GID)
- arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
+ if (ivalid & ATTR_UID) {
+ arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
+ if (arg->uid == (uid_t)-1)
+ return -EINVAL;
+ arg->valid |= FATTR_UID;
+ }
+ if (ivalid & ATTR_GID) {
+ arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
+ if (arg->gid == (gid_t)-1)
+ return -EINVAL;
+ arg->valid |= FATTR_GID;
+ }
if (ivalid & ATTR_SIZE)
arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
if (ivalid & ATTR_ATIME) {
@@ -1588,6 +1604,8 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
arg->ctime = iattr->ia_ctime.tv_sec;
arg->ctimensec = iattr->ia_ctime.tv_nsec;
}
+
+ return 0;
}
/*
@@ -1741,7 +1759,9 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(attr, &inarg, trust_local_cmtime);
+ err = iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
+ if (err)
+ goto error;
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1778,8 +1798,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
/* FIXME: clear I_DIRTY_SYNC? */
}
- fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg));
+ err = fuse_change_attributes_common(inode, &outarg.attr,
+ attr_timeout(&outarg));
+ if (err) {
+ spin_unlock(&fc->lock);
+ goto error;
+ }
+
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a3ded071e2c6..81187ba04e4a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
#include <linux/rbtree.h>
#include <linux/poll.h>
#include <linux/workqueue.h>
+#include <linux/user_namespace.h>
#include <linux/pid_namespace.h>
/** Max number of pages that can be used in a single read request */
@@ -387,6 +388,9 @@ struct fuse_conn {
/** The group id for this mount */
kgid_t group_id;
+ /** The user namespace for this mount */
+ struct user_namespace *user_ns;
+
/** The pid namespace for this mount */
struct pid_namespace *pid_ns;
@@ -713,11 +717,11 @@ void fuse_init_symlink(struct inode *inode);
/**
* Change attributes of an inode
*/
-void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid, u64 attr_version);
+int fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version);
-void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid);
+int fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid);
/**
* Initialize the client device
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e137969815a3..b88b5a780228 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -155,11 +155,22 @@ static ino_t fuse_squash_ino(u64 ino64)
return ino;
}
-void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid)
+int fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ kuid_t uid;
+ kgid_t gid;
+
+ uid = make_kuid(fc->user_ns, attr->uid);
+ gid = make_kgid(fc->user_ns, attr->gid);
+ if (!uid_valid(uid) || !gid_valid(gid)) {
+ make_bad_inode(inode);
+ return -EIO;
+ }
+ inode->i_uid = uid;
+ inode->i_gid = gid;
fi->attr_version = ++fc->attr_version;
fi->i_time = attr_valid;
@@ -167,8 +178,6 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
set_nlink(inode, attr->nlink);
- inode->i_uid = make_kuid(&init_user_ns, attr->uid);
- inode->i_gid = make_kgid(&init_user_ns, attr->gid);
inode->i_blocks = attr->blocks;
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
@@ -195,26 +204,32 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_mode &= ~S_ISVTX;
fi->orig_ino = attr->ino;
+ return 0;
}
-void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid, u64 attr_version)
+int fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
bool is_wb = fc->writeback_cache;
loff_t oldsize;
struct timespec old_mtime;
+ int err;
spin_lock(&fc->lock);
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fc->lock);
- return;
+ return 0;
}
old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid);
+ err = fuse_change_attributes_common(inode, attr, attr_valid);
+ if (err) {
+ spin_unlock(&fc->lock);
+ return err;
+ }
oldsize = inode->i_size;
/*
@@ -249,6 +264,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
if (inval)
invalidate_inode_pages2(inode->i_mapping);
}
+
+ return 0;
}
static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
@@ -302,7 +319,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
if (!inode)
- return NULL;
+ return ERR_PTR(-ENOMEM);
if ((inode->i_state & I_NEW)) {
inode->i_flags |= S_NOATIME;
@@ -319,11 +336,23 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
goto retry;
}
+ /*
+ * Must do this before incrementing nlookup, as the caller will
+ * send a forget for the node if this function fails.
+ */
+ if (fuse_change_attributes(inode, attr, attr_valid, attr_version)) {
+ /*
+ * inode_make_bad() already called by
+ * fuse_change_attributes()
+ */
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
+
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
fi->nlookup++;
spin_unlock(&fc->lock);
- fuse_change_attributes(inode, attr, attr_valid, attr_version);
return inode;
}
@@ -496,6 +525,8 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
memset(d, 0, sizeof(struct fuse_mount_data));
d->max_read = ~0;
d->blksize = FUSE_DEFAULT_BLKSIZE;
+ d->user_id = make_kuid(current_user_ns(), 0);
+ d->group_id = make_kgid(current_user_ns(), 0);
while ((p = strsep(&opt, ",")) != NULL) {
int token;
@@ -578,8 +609,10 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
seq_puts(m, ",default_permissions");
if (fc->flags & FUSE_ALLOW_OTHER)
@@ -618,6 +651,7 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->user_ns = get_user_ns(current_user_ns());
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -626,6 +660,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (atomic_dec_and_test(&fc->count)) {
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = NULL;
put_pid_ns(fc->pid_ns);
fc->pid_ns = NULL;
fc->release(fc);
@@ -643,12 +679,15 @@ EXPORT_SYMBOL_GPL(fuse_conn_get);
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
+ struct inode *inode;
+
memset(&attr, 0, sizeof(attr));
attr.mode = mode;
attr.ino = FUSE_ROOT_ID;
attr.nlink = 1;
- return fuse_iget(sb, 1, 0, &attr, 0, 0);
+ inode = fuse_iget(sb, 1, 0, &attr, 0, 0);
+ return IS_ERR(inode) ? NULL : inode;
}
struct fuse_inode_handle {
@@ -1043,8 +1082,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (!file)
goto err;
- if ((file->f_op != &fuse_dev_operations) ||
- (file->f_cred->user_ns != &init_user_ns))
+ /*
+ * Require mount to happen from the same user namespace which
+ * opened /dev/fuse to prevent potential attacks.
+ */
+ if (file->f_op != &fuse_dev_operations ||
+ file->f_cred->user_ns != current_user_ns())
goto err_fput;
fc = kmalloc(sizeof(*fc), GFP_KERNEL);

Seth Forshee

2014-10-15 15:05:12 UTC

I'm not sure that this is necessary if my nosuid patch goes in, but I
also don't think it makes any sense to hold this up while we find a
perfect solution.
Is there a decent way to extend this to different translation schemes in
the future (e.g. a flag at fs setup that could be used)?

I think it would be possible to relax the translation scheme
restrictions in the future, certainly that's easier than tightening down
a looser restriction. I still favor picking one namespace to use for
translation (surely that's how it would work with other filesystems
anyway) rather than using the current namespace during /dev/fuse I/O. I
did an implementation using the latter technique, and it's far more
complex with no benefits that I can see.

Thanks,
Seth
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Andy Lutomirski

2014-10-15 17:05:46 UTC

On Wed, Oct 15, 2014 at 8:05 AM, Seth Forshee

I'm not sure that this is necessary if my nosuid patch goes in, but I
also don't think it makes any sense to hold this up while we find a
perfect solution.
Is there a decent way to extend this to different translation schemes in
the future (e.g. a flag at fs setup that could be used)?

Long term, I think we'll want more flexible translations for
filesystems on removable media, even when both the mounter and the
accessing process are in the init user namespace. But this can wait.

--Andy

Post by Seth Forshee
Thanks,
Seth

--
Andy Lutomirski
AMA Capital Management, LLC

Seth Forshee

2014-10-15 22:59:00 UTC

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 8:05 AM, Seth Forshee

I'm not sure that this is necessary if my nosuid patch goes in, but I
also don't think it makes any sense to hold this up while we find a
perfect solution.
Is there a decent way to extend this to different translation schemes in
the future (e.g. a flag at fs setup that could be used)?

Long term, I think we'll want more flexible translations for
filesystems on removable media, even when both the mounter and the
accessing process are in the init user namespace. But this can wait.

You've piqued my interest. What are you thinking of which would require
this flexibility?

Thanks,
Seth

Andy Lutomirski

2014-10-15 23:07:34 UTC

On Wed, Oct 15, 2014 at 3:59 PM, Seth Forshee

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 8:05 AM, Seth Forshee

I'm not sure that this is necessary if my nosuid patch goes in, but I
also don't think it makes any sense to hold this up while we find a
perfect solution.
Is there a decent way to extend this to different translation schemes in
the future (e.g. a flag at fs setup that could be used)?

Long term, I think we'll want more flexible translations for
filesystems on removable media, even when both the mounter and the
accessing process are in the init user namespace. But this can wait.

You've piqued my interest. What are you thinking of which would require
this flexibility?

For an easy example, if I stick a USB stick into my computer and copy
a file to it, I probably want the file to be owned by uid 0 in the FS
metadata (but still owned by me as reported by stat(2) and friends).

For a more complex example, tools like Sandstorm (http://sandstorm.io)
probably want to use FUSE mounted by an outer (non-root) userns and
accessed from an inner userns. With your patches, this *might* work,
but it might also be a little tricky.

I can also see this ability being extremely useful for NFS and other
network filesystems, where keeping all the uids in sync is currently a
royal PITA.

--Andy

Seth Forshee

2014-10-15 23:24:20 UTC

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 3:59 PM, Seth Forshee

Post by Andy Lutomirski
On Wed, Oct 15, 2014 at 8:05 AM, Seth Forshee

I'm not sure that this is necessary if my nosuid patch goes in, but I
also don't think it makes any sense to hold this up while we find a
perfect solution.
Is there a decent way to extend this to different translation schemes in
the future (e.g. a flag at fs setup that could be used)?

Long term, I think we'll want more flexible translations for
filesystems on removable media, even when both the mounter and the
accessing process are in the init user namespace. But this can wait.

You've piqued my interest. What are you thinking of which would require
this flexibility?

For an easy example, if I stick a USB stick into my computer and copy
a file to it, I probably want the file to be owned by uid 0 in the FS
metadata (but still owned by me as reported by stat(2) and friends).
For a more complex example, tools like Sandstorm (http://sandstorm.io)
probably want to use FUSE mounted by an outer (non-root) userns and
accessed from an inner userns. With your patches, this *might* work,
but it might also be a little tricky.

This at least should work fine with my patches so long as the fuse mount
has the allow_other option the inner userns is a descendant of the outer
ns. I don't think there's anything tricky, though I do suspect you'll
also want the default_permissions option.

Thanks,
Seth

Post by Andy Lutomirski
I can also see this ability being extremely useful for NFS and other
network filesystems, where keeping all the uids in sync is currently a
royal PITA.
--Andy

Seth Forshee

2014-10-14 14:25:56 UTC

Cc: Eric W. Biederman <***@xmission.com>
Cc: Serge H. Hallyn <***@ubuntu.com>
Signed-off-by: Seth Forshee <***@canonical.com>
---
fs/fuse/inode.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5e00a6a76049..6522926b14e4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1212,7 +1212,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
.name = "fuse",
- .fs_flags = FS_HAS_SUBTYPE,
+ .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
.mount = fuse_mount,
.kill_sb = fuse_kill_sb_anon,
};
@@ -1244,7 +1244,7 @@ static struct file_system_type fuseblk_fs_type = {
.name = "fuseblk",
.mount = fuse_mount_blk,
.kill_sb = fuse_kill_sb_blk,
- .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+ .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
};
MODULE_ALIAS_FS("fuseblk");

--
2.1.0

Andy Lutomirski

2014-10-15 14:58:53 UTC

Post by Seth Forshee
---
fs/fuse/inode.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5e00a6a76049..6522926b14e4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1212,7 +1212,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
.name = "fuse",
- .fs_flags = FS_HAS_SUBTYPE,
+ .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
.mount = fuse_mount,
.kill_sb = fuse_kill_sb_anon,
};
@@ -1244,7 +1244,7 @@ static struct file_system_type fuseblk_fs_type = {
.name = "fuseblk",
.mount = fuse_mount_blk,
.kill_sb = fuse_kill_sb_blk,
- .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+ .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_USERNS_MOUNT,

I think it's decision time -- if these patches are applied, then FUSE
will be the first filesystem for which userns nodev behavior matters for
security, so applying this patch will enshrine an API decision.

I would very much prefer to make this patch depend on this:

http://lkml.kernel.org/g/***@amacapital.net

That change will require that anyone who tries to mount one of these
things explicitly requests MS_NODEV instead of keeping the current
behavior of implicitly setting MS_NODEV and possibly confusing user code
that tries to remount.

If you like my patch, feel free to fold it in to your series, or Eric
can apply it directly (pretty please).

For background, with your patches as is, if you mount a FUSE fs and then
remount it with identical flags, the remount is likely to fail.

--Andy

Post by Seth Forshee
};
MODULE_ALIAS_FS("fuseblk");

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Seth Forshee

2014-10-15 15:20:13 UTC

I think it's decision time -- if these patches are applied, then FUSE
will be the first filesystem for which userns nodev behavior matters for
security, so applying this patch will enshrine an API decision.
That change will require that anyone who tries to mount one of these
things explicitly requests MS_NODEV instead of keeping the current
behavior of implicitly setting MS_NODEV and possibly confusing user code
that tries to remount.
If you like my patch, feel free to fold it in to your series, or Eric
can apply it directly (pretty please).
For background, with your patches as is, if you mount a FUSE fs and then
remount it with identical flags, the remount is likely to fail.

I discussed this with Eric during LinuxCon NA ... as I recall he was
undecided about whether or not to use your patch at the time. I do
prefer an explicit failure over implicitly adding MS_NODEV, but it's not
up to me. I do agree though that we should make a decision before
merging the fuse patches, I was just assuming that the decision was
already made.

Thanks,
Seth

Andy Lutomirski

2014-10-15 23:08:54 UTC

On Wed, Oct 15, 2014 at 8:20 AM, Seth Forshee

I think it's decision time -- if these patches are applied, then FUSE
will be the first filesystem for which userns nodev behavior matters for
security, so applying this patch will enshrine an API decision.
That change will require that anyone who tries to mount one of these
things explicitly requests MS_NODEV instead of keeping the current
behavior of implicitly setting MS_NODEV and possibly confusing user code
that tries to remount.
If you like my patch, feel free to fold it in to your series, or Eric
can apply it directly (pretty please).
For background, with your patches as is, if you mount a FUSE fs and then
remount it with identical flags, the remount is likely to fail.

As far as I know, no decision has been made. I discussed it with Eric
at LinuxCon NA, too. Too bad we didn't meet there.

Hopefully your patches will convince him to ack my patch :)

--Andy

Seth Forshee

2014-10-15 23:07:01 UTC

I think it's decision time -- if these patches are applied, then FUSE
will be the first filesystem for which userns nodev behavior matters for
security, so applying this patch will enshrine an API decision.
That change will require that anyone who tries to mount one of these
things explicitly requests MS_NODEV instead of keeping the current
behavior of implicitly setting MS_NODEV and possibly confusing user code
that tries to remount.
If you like my patch, feel free to fold it in to your series, or Eric
can apply it directly (pretty please).
For background, with your patches as is, if you mount a FUSE fs and then
remount it with identical flags, the remount is likely to fail.

(Resending my response since I still don't see it on lkml after 7+
hours)

I discussed this with Eric during LinuxCon NA ... as I recall he was
undecided about whether or not to use your patch at the time. I do
prefer an explicit failure over implicitly adding MS_NODEV, but it's not
up to me. I do agree though that we should make a decision before
merging the fuse patches, I was just assuming that the decision was
already made.

Thanks,
Seth

Seth Forshee

2014-10-14 14:25:54 UTC

Unprivileged users are normally restricted from mounting with the
allow_other option by system policy, but this could be bypassed
for a mount done with user namespace root permissions. In such
cases allow_other should not allow users outside the user
namespace to access the mount as doing so would give the
unprivileged user the ability to manipulate processes it would
otherwise be unable to manipulate.

Cc: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/***@public.gmane.org>
Cc: Serge H. Hallyn <serge.hallyn-GeWIH/nMZzLQT0dZR+***@public.gmane.org>
Signed-off-by: Seth Forshee <seth.forshee-Z7WLFzj8eWMS+***@public.gmane.org>
---
fs/fuse/dir.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 123db1e06c78..e3123bfbc711 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1089,12 +1089,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
*/
int fuse_allow_current_process(struct fuse_conn *fc)
{
- const struct cred *cred;
+ const struct cred *cred = current_cred();

- if (fc->flags & FUSE_ALLOW_OTHER)
- return 1;
+ if (fc->flags & FUSE_ALLOW_OTHER) {
+ if (kuid_has_mapping(fc->user_ns, cred->euid) &&
+ kuid_has_mapping(fc->user_ns, cred->suid) &&
+ kuid_has_mapping(fc->user_ns, cred->uid) &&
+ kgid_has_mapping(fc->user_ns, cred->egid) &&
+ kgid_has_mapping(fc->user_ns, cred->sgid) &&
+ kgid_has_mapping(fc->user_ns, cred->gid))
+ return 1;
+
+ return 0;
+ }

- cred = current_cred();
if (uid_eq(cred->euid, fc->user_id) &&
uid_eq(cred->suid, fc->user_id) &&
uid_eq(cred->uid, fc->user_id) &&

--
2.1.0

Andy Lutomirski

2014-10-15 14:58:59 UTC

Post by Seth Forshee
Unprivileged users are normally restricted from mounting with the
allow_other option by system policy, but this could be bypassed
for a mount done with user namespace root permissions. In such
cases allow_other should not allow users outside the user
namespace to access the mount as doing so would give the
unprivileged user the ability to manipulate processes it would
otherwise be unable to manipulate.

What threat is this intended to protect against? I think that, if this
is needed, tasks outside the userns or its descendents should be
blocked, even if the user ids match. That is, I think you should check
the namespace, not the uid and gid.

--Andy

Post by Seth Forshee
---
fs/fuse/dir.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 123db1e06c78..e3123bfbc711 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1089,12 +1089,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
*/
int fuse_allow_current_process(struct fuse_conn *fc)
{
- const struct cred *cred;
+ const struct cred *cred = current_cred();
- if (fc->flags & FUSE_ALLOW_OTHER)
- return 1;
+ if (fc->flags & FUSE_ALLOW_OTHER) {
+ if (kuid_has_mapping(fc->user_ns, cred->euid) &&
+ kuid_has_mapping(fc->user_ns, cred->suid) &&
+ kuid_has_mapping(fc->user_ns, cred->uid) &&
+ kgid_has_mapping(fc->user_ns, cred->egid) &&
+ kgid_has_mapping(fc->user_ns, cred->sgid) &&
+ kgid_has_mapping(fc->user_ns, cred->gid))
+ return 1;
+
+ return 0;
+ }
- cred = current_cred();
if (uid_eq(cred->euid, fc->user_id) &&
uid_eq(cred->suid, fc->user_id) &&
uid_eq(cred->uid, fc->user_id) &&

Seth Forshee

2014-10-15 15:11:00 UTC

allow_other is an existing option in fuse to protect against DoS
attacks against more privileged users by making file operations block
indefinitely. So this change makes it work the same way inside a user
namespace but only to users mapped into the namespace. Checking the
namespace does seem to make more sense, so I'll make that change.

Thanks,
Seth

Seth Forshee

2014-10-14 14:25:52 UTC