From 53db28933e952a8536b002ba8b8c9443ccc0e939 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 25 Nov 2021 14:05:18 +0100 Subject: [PATCH 01/10] fuse: extend init flags FUSE_INIT flags are close to running out, so add another 32bits worth of space. Add FUSE_INIT_EXT flag to the old flags field in fuse_init_in. If this flag is set, then fuse_init_in is extended by 48bytes, in which a flags_hi field is allocated to contain the high 32bits of the flags. A flags_hi field is also added to fuse_init_out, allocated out of the remaining unused fields. Known userspace implementations of the fuse protocol have been checked to accept the extended FUSE_INIT request, but this might cause problems with other implementations. If that happens to be the case, the protocol negotiation will have to be extended with an extra initialization request roundtrip. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 60 +++++++++++++++++++++------------------ include/uapi/linux/fuse.h | 16 +++++++++-- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8b89e3ba7df3..5a1dad8c1f92 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1109,72 +1109,74 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { + u64 flags = arg->flags | (u64) arg->flags2 << 32; + ra_pages = arg->max_readahead / PAGE_SIZE; - if (arg->flags & FUSE_ASYNC_READ) + if (flags & FUSE_ASYNC_READ) fc->async_read = 1; - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; if (arg->minor >= 17) { - if (!(arg->flags & FUSE_FLOCK_LOCKS)) + if (!(flags & FUSE_FLOCK_LOCKS)) fc->no_flock = 1; } else { - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_flock = 1; } - if (arg->flags & FUSE_ATOMIC_O_TRUNC) + if (flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { /* LOOKUP has dependency on proto version */ - if (arg->flags & FUSE_EXPORT_SUPPORT) + if (flags & FUSE_EXPORT_SUPPORT) fc->export_support = 1; } - if (arg->flags & FUSE_BIG_WRITES) + if (flags & FUSE_BIG_WRITES) fc->big_writes = 1; - if (arg->flags & FUSE_DONT_MASK) + if (flags & FUSE_DONT_MASK) fc->dont_mask = 1; - if (arg->flags & FUSE_AUTO_INVAL_DATA) + if (flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA) + else if (flags & FUSE_EXPLICIT_INVAL_DATA) fc->explicit_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) { + if (flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) + if (flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; } - if (arg->flags & FUSE_ASYNC_DIO) + if (flags & FUSE_ASYNC_DIO) fc->async_dio = 1; - if (arg->flags & FUSE_WRITEBACK_CACHE) + if (flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; - if (arg->flags & FUSE_PARALLEL_DIROPS) + if (flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; - if (arg->flags & FUSE_HANDLE_KILLPRIV) + if (flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fm->sb->s_time_gran = arg->time_gran; - if ((arg->flags & FUSE_POSIX_ACL)) { + if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; fm->sb->s_xattr = fuse_acl_xattr_handlers; } - if (arg->flags & FUSE_CACHE_SYMLINKS) + if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; - if (arg->flags & FUSE_ABORT_ERROR) + if (flags & FUSE_ABORT_ERROR) fc->abort_err = 1; - if (arg->flags & FUSE_MAX_PAGES) { + if (flags & FUSE_MAX_PAGES) { fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX) && - arg->flags & FUSE_MAP_ALIGNMENT && + flags & FUSE_MAP_ALIGNMENT && !fuse_dax_check_alignment(fc, arg->map_alignment)) { ok = false; } - if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + if (flags & FUSE_HANDLE_KILLPRIV_V2) { fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } - if (arg->flags & FUSE_SETXATTR_EXT) + if (flags & FUSE_SETXATTR_EXT) fc->setxattr_ext = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; @@ -1203,13 +1205,14 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; + u64 flags; ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - ia->in.flags |= + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -1219,13 +1222,16 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) - ia->in.flags |= FUSE_MAP_ALIGNMENT; + flags |= FUSE_MAP_ALIGNMENT; #endif if (fm->fc->auto_submounts) - ia->in.flags |= FUSE_SUBMOUNTS; + flags |= FUSE_SUBMOUNTS; + + ia->in.flags = flags; + ia->in.flags2 = flags >> 32; ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index a1dc3ee1d17c..980f3998c11b 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -187,6 +187,10 @@ * * 7.35 * - add FOPEN_NOFLUSH + * + * 7.36 + * - extend fuse_init_in with reserved fields, add FUSE_INIT_EXT init flag + * - add flags2 to fuse_init_in and fuse_init_out */ #ifndef _LINUX_FUSE_H @@ -222,7 +226,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 35 +#define FUSE_KERNEL_MINOR_VERSION 36 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -341,6 +345,8 @@ struct fuse_file_lock { * write/truncate sgid is killed only if file has group * execute permission. (Same as Linux VFS behavior). * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in + * FUSE_INIT_EXT: extended fuse_init_in request + * FUSE_INIT_RESERVED: reserved, do not use */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -372,6 +378,9 @@ struct fuse_file_lock { #define FUSE_SUBMOUNTS (1 << 27) #define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) #define FUSE_SETXATTR_EXT (1 << 29) +#define FUSE_INIT_EXT (1 << 30) +#define FUSE_INIT_RESERVED (1 << 31) +/* bits 32..63 get shifted down 32 bits into the flags2 field */ /** * CUSE INIT request/reply flags @@ -741,6 +750,8 @@ struct fuse_init_in { uint32_t minor; uint32_t max_readahead; uint32_t flags; + uint32_t flags2; + uint32_t unused[11]; }; #define FUSE_COMPAT_INIT_OUT_SIZE 8 @@ -757,7 +768,8 @@ struct fuse_init_out { uint32_t time_gran; uint16_t max_pages; uint16_t map_alignment; - uint32_t unused[8]; + uint32_t flags2; + uint32_t unused[7]; }; #define CUSE_INIT_INFO_MAX 4096 From 3e2b6fdbdc9ab5a02d9d5676a005f30780b97553 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 11 Nov 2021 09:32:49 -0500 Subject: [PATCH 02/10] fuse: send security context of inode on file When a new inode is created, send its security context to server along with creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK). This gives server an opportunity to create new file and set security context (possibly atomically). In all the configurations it might not be possible to set context atomically. Like nfs and ceph, use security_dentry_init_security() to dermine security context of inode and send it with create, mkdir, mknod, and symlink requests. Following is the information sent to server. fuse_sectx_header, fuse_secctx, xattr_name, security_context - struct fuse_secctx_header This contains total number of security contexts being sent and total size of all the security contexts (including size of fuse_secctx_header). - struct fuse_secctx This contains size of security context which follows this structure. There is one fuse_secctx instance per security context. - xattr name string This string represents name of xattr which should be used while setting security context. - security context This is the actual security context whose size is specified in fuse_secctx struct. Also add the FUSE_SECURITY_CTX flag for the `flags` field of the fuse_init_out struct. When this flag is set the kernel will append the security context for a newly created inode to the request (create, mkdir, mknod, and symlink). The server is responsible for ensuring that the inode appears atomically (preferrably) with the requested security context. For example, If the server is using SELinux and backed by a "real" linux file system that supports extended attributes it can write the security context value to /proc/thread-self/attr/fscreate before making the syscall to create the inode. This patch is based on patch from Chirantan Ekbote Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 91 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 3 ++ fs/fuse/inode.c | 5 ++- include/uapi/linux/fuse.h | 34 ++++++++++++++- 4 files changed, 130 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0654bfedcbb0..656e921f3506 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include static void fuse_advise_use_readdirplus(struct inode *dir) { @@ -456,6 +459,62 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, return ERR_PTR(err); } +static int get_security_context(struct dentry *entry, umode_t mode, + void **security_ctx, u32 *security_ctxlen) +{ + struct fuse_secctx *fctx; + struct fuse_secctx_header *header; + void *ctx = NULL, *ptr; + u32 ctxlen, total_len = sizeof(*header); + int err, nr_ctx = 0; + const char *name; + size_t namelen; + + err = security_dentry_init_security(entry, mode, &entry->d_name, + &name, &ctx, &ctxlen); + if (err) { + if (err != -EOPNOTSUPP) + goto out_err; + /* No LSM is supporting this security hook. Ignore error */ + ctxlen = 0; + ctx = NULL; + } + + if (ctxlen) { + nr_ctx = 1; + namelen = strlen(name) + 1; + err = -EIO; + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + goto out_err; + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + } + + err = -ENOMEM; + header = ptr = kzalloc(total_len, GFP_KERNEL); + if (!ptr) + goto out_err; + + header->nr_secctx = nr_ctx; + header->size = total_len; + ptr += sizeof(*header); + if (nr_ctx) { + fctx = ptr; + fctx->size = ctxlen; + ptr += sizeof(*fctx); + + strcpy(ptr, name); + ptr += namelen; + + memcpy(ptr, ctx, ctxlen); + } + *security_ctxlen = total_len; + *security_ctx = header; + err = 0; +out_err: + kfree(ctx); + return err; +} + /* * Atomic create+open operation * @@ -476,6 +535,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + void *security_ctx = NULL; + u32 security_ctxlen; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -517,7 +578,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; + + if (fm->fc->init_security) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + args.in_numargs = 3; + args.in_args[2].size = security_ctxlen; + args.in_args[2].value = security_ctx; + } + err = fuse_simple_request(fm, &args); + kfree(security_ctx); if (err) goto out_free_ff; @@ -620,6 +694,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct dentry *d; int err; struct fuse_forget_link *forget; + void *security_ctx = NULL; + u32 security_ctxlen; if (fuse_is_bad(dir)) return -EIO; @@ -633,7 +709,22 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; + + if (fm->fc->init_security && args->opcode != FUSE_LINK) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + BUG_ON(args->in_numargs != 2); + + args->in_numargs = 3; + args->in_args[2].size = security_ctxlen; + args->in_args[2].value = security_ctx; + } + err = fuse_simple_request(fm, args); + kfree(security_ctx); if (err) goto out_put_forget_req; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 198637b41e19..c1a8b313e6ed 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -765,6 +765,9 @@ struct fuse_conn { /* Propagate syncfs() to server */ unsigned int sync_fs:1; + /* Initialize security xattrs when creating a new inode */ + unsigned int init_security:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5a1dad8c1f92..63ab45427de5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1178,6 +1178,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, } if (flags & FUSE_SETXATTR_EXT) fc->setxattr_ext = 1; + if (flags & FUSE_SECURITY_CTX) + fc->init_security = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1222,7 +1224,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | + FUSE_SECURITY_CTX; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 980f3998c11b..3f0ea63fec08 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -191,6 +191,8 @@ * 7.36 * - extend fuse_init_in with reserved fields, add FUSE_INIT_EXT init flag * - add flags2 to fuse_init_in and fuse_init_out + * - add FUSE_SECURITY_CTX init flag + * - add security context to create, mkdir, symlink, and mknod requests */ #ifndef _LINUX_FUSE_H @@ -347,6 +349,8 @@ struct fuse_file_lock { * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in * FUSE_INIT_EXT: extended fuse_init_in request * FUSE_INIT_RESERVED: reserved, do not use + * FUSE_SECURITY_CTX: add security context to create, mkdir, symlink, and + * mknod */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -381,6 +385,7 @@ struct fuse_file_lock { #define FUSE_INIT_EXT (1 << 30) #define FUSE_INIT_RESERVED (1 << 31) /* bits 32..63 get shifted down 32 bits into the flags2 field */ +#define FUSE_SECURITY_CTX (1ULL << 32) /** * CUSE INIT request/reply flags @@ -877,9 +882,12 @@ struct fuse_dirent { char name[]; }; -#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) -#define FUSE_DIRENT_ALIGN(x) \ +/* Align variable length records to 64bit boundary */ +#define FUSE_REC_ALIGN(x) \ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) + +#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) +#define FUSE_DIRENT_ALIGN(x) FUSE_REC_ALIGN(x) #define FUSE_DIRENT_SIZE(d) \ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) @@ -996,4 +1004,26 @@ struct fuse_syncfs_in { uint64_t padding; }; +/* + * For each security context, send fuse_secctx with size of security context + * fuse_secctx will be followed by security context name and this in turn + * will be followed by actual context label. + * fuse_secctx, name, context + */ +struct fuse_secctx { + uint32_t size; + uint32_t padding; +}; + +/* + * Contains the information about how many fuse_secctx structures are being + * sent and what's the total size of all security contexts (including + * size of fuse_secctx_header). + * + */ +struct fuse_secctx_header { + uint32_t size; + uint32_t nr_secctx; +}; + #endif /* _LINUX_FUSE_H */ From e388164ea385f04666c4633f5dc4f951fca71890 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 22 Nov 2021 17:05:31 +0800 Subject: [PATCH 03/10] fuse: Pass correct lend value to filemap_write_and_wait_range() The acceptable maximum value of lend parameter in filemap_write_and_wait_range() is LLONG_MAX rather than -1. And there is also some logic depending on LLONG_MAX check in write_cache_pages(). So let's pass LLONG_MAX to filemap_write_and_wait_range() in fuse_writeback_range() instead. Fixes: 59bda8ecee2f ("fuse: flush extending writes") Signed-off-by: Xie Yongji Cc: # v5.15 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d6c5f6361f7..df81768c81a7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2910,7 +2910,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); From cecd491641c23f3c63958a62efb74cdaf3c93d7b Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:24 +0800 Subject: [PATCH 04/10] fuse: add fuse_should_enable_dax() helper This is in prep for following per inode DAX checking. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 713818d74de6..8057fbf5576c 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1327,11 +1327,19 @@ static const struct address_space_operations fuse_dax_file_aops = { .invalidatepage = noop_invalidatepage, }; -void fuse_dax_inode_init(struct inode *inode) +static bool fuse_should_enable_dax(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); if (!fc->dax) + return false; + + return true; +} + +void fuse_dax_inode_init(struct inode *inode) +{ + if (!fuse_should_enable_dax(inode)) return; inode->i_flags |= S_DAX; From 780b1b959f9bd959e1aca450e9fee0e2c00b31ad Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:25 +0800 Subject: [PATCH 05/10] fuse: make DAX mount option a tri-state We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. The following behavior is consistent with that on ext4/xfs: - The default behavior (when neither '-o dax' nor '-o dax=always|never|inode' option is specified) is equal to 'inode' mode, while 'dax=inode' won't be printed among the mount option list. - The 'inode' mode is only advisory. It will silently fallback to 'never' mode if fuse server doesn't support that. Also noted that by the time of this commit, 'inode' mode is actually equal to 'always' mode, before the per inode DAX flag is introduced in the following patch. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 13 ++++++++++++- fs/fuse/fuse_i.h | 20 ++++++++++++++++++-- fs/fuse/inode.c | 10 +++++++--- fs/fuse/virtio_fs.c | 18 +++++++++++++++--- 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8057fbf5576c..d2bc5e7f5132 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1279,11 +1279,14 @@ out_err: return ret; } -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, + struct dax_device *dax_dev) { struct fuse_conn_dax *fcd; int err; + fc->dax_mode = dax_mode; + if (!dax_dev) return 0; @@ -1330,7 +1333,15 @@ static const struct address_space_operations fuse_dax_file_aops = { static bool fuse_should_enable_dax(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); + enum fuse_dax_mode dax_mode = fc->dax_mode; + if (dax_mode == FUSE_DAX_NEVER) + return false; + + /* + * fc->dax may be NULL in 'inode' mode when filesystem device doesn't + * support DAX, in which case it will silently fallback to 'never' mode. + */ if (!fc->dax) return false; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c1a8b313e6ed..8bf133109170 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -480,6 +480,18 @@ struct fuse_dev { struct list_head entry; }; +enum fuse_dax_mode { + FUSE_DAX_INODE_DEFAULT, /* default */ + FUSE_DAX_ALWAYS, /* "-o dax=always" */ + FUSE_DAX_NEVER, /* "-o dax=never" */ + FUSE_DAX_INODE_USER, /* "-o dax=inode" */ +}; + +static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode) +{ + return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER; +} + struct fuse_fs_context { int fd; struct file *file; @@ -497,7 +509,7 @@ struct fuse_fs_context { bool no_control:1; bool no_force_umount:1; bool legacy_opts_show:1; - bool dax:1; + enum fuse_dax_mode dax_mode; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -805,6 +817,9 @@ struct fuse_conn { struct list_head devices; #ifdef CONFIG_FUSE_DAX + /* Dax mode */ + enum fuse_dax_mode dax_mode; + /* Dax specific conn data, non-NULL if DAX is enabled */ struct fuse_conn_dax *dax; #endif @@ -1272,7 +1287,8 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, + struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); void fuse_dax_inode_init(struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 63ab45427de5..d83589ef11c0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -767,8 +767,12 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",blksize=%lu", sb->s_blocksize); } #ifdef CONFIG_FUSE_DAX - if (fc->dax) - seq_puts(m, ",dax"); + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); #endif return 0; @@ -1523,7 +1527,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; if (IS_ENABLED(CONFIG_FUSE_DAX)) { - err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); if (err) goto err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4cfa4bc1f579..e54dc069587d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -88,12 +88,21 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +static const struct constant_table dax_param_enums[] = { + {"always", FUSE_DAX_ALWAYS }, + {"never", FUSE_DAX_NEVER }, + {"inode", FUSE_DAX_INODE_USER }, + {} +}; + enum { OPT_DAX, + OPT_DAX_ENUM, }; static const struct fs_parameter_spec virtio_fs_parameters[] = { fsparam_flag("dax", OPT_DAX), + fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), {} }; @@ -110,7 +119,10 @@ static int virtio_fs_parse_param(struct fs_context *fsc, switch (opt) { case OPT_DAX: - ctx->dax = 1; + ctx->dax_mode = FUSE_DAX_ALWAYS; + break; + case OPT_DAX_ENUM: + ctx->dax_mode = result.uint_32; break; default: return -EINVAL; @@ -1326,8 +1338,8 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) /* virtiofs allocates and installs its own fuse devices */ ctx->fudptr = NULL; - if (ctx->dax) { - if (!fs->dax_dev) { + if (ctx->dax_mode != FUSE_DAX_NEVER) { + if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { err = -EINVAL; pr_err("virtio-fs: dax can't be enabled as filesystem" " device does not support it.\n"); From 98046f7486db723ec8bb99a950a4fa5f5be55cd1 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:26 +0800 Subject: [PATCH 06/10] fuse: support per inode DAX in fuse protocol Expand the fuse protocol to support per inode DAX. FUSE_HAS_INODE_DAX flag is added indicating if fuse server/client supporting per inode DAX. It can be conveyed in both FUSE_INIT request and reply. FUSE_ATTR_DAX flag is added indicating if DAX shall be enabled for corresponding file. It is conveyed in FUSE_LOOKUP reply. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 3f0ea63fec08..d6ccee961891 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -193,6 +193,7 @@ * - add flags2 to fuse_init_in and fuse_init_out * - add FUSE_SECURITY_CTX init flag * - add security context to create, mkdir, symlink, and mknod requests + * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX */ #ifndef _LINUX_FUSE_H @@ -351,6 +352,7 @@ struct fuse_file_lock { * FUSE_INIT_RESERVED: reserved, do not use * FUSE_SECURITY_CTX: add security context to create, mkdir, symlink, and * mknod + * FUSE_HAS_INODE_DAX: use per inode DAX */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -386,6 +388,7 @@ struct fuse_file_lock { #define FUSE_INIT_RESERVED (1 << 31) /* bits 32..63 get shifted down 32 bits into the flags2 field */ #define FUSE_SECURITY_CTX (1ULL << 32) +#define FUSE_HAS_INODE_DAX (1ULL << 33) /** * CUSE INIT request/reply flags @@ -468,8 +471,10 @@ struct fuse_file_lock { * fuse_attr flags * * FUSE_ATTR_SUBMOUNT: Object is a submount root + * FUSE_ATTR_DAX: Enable DAX for this file in per inode DAX mode */ #define FUSE_ATTR_SUBMOUNT (1 << 0) +#define FUSE_ATTR_DAX (1 << 1) /** * Open flags From 93a497b9ad695bb2f38a302c5b29dbc9b555ff3f Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:27 +0800 Subject: [PATCH 07/10] fuse: enable per inode DAX DAX may be limited in some specific situation. When the number of usable DAX windows is under watermark, the recalim routine will be triggered to reclaim some DAX windows. It may have a negative impact on the performance, since some processes may need to wait for DAX windows to be recalimed and reused then. To mitigate the performance degradation, the overall DAX window need to be expanded larger. However, simply expanding the DAX window may not be a good deal in some scenario. To maintain one DAX window chunk (i.e., 2MB in size), 32KB (512 * 64 bytes) memory footprint will be consumed for page descriptors inside guest, which is greater than the memory footprint if it uses guest page cache when DAX disabled. Thus it'd better disable DAX for those files smaller than 32KB, to reduce the demand for DAX window and thus avoid the unworthy memory overhead. Per inode DAX feature is introduced to address this issue, by offering a finer grained control for dax to users, trying to achieve a balance between performance and memory overhead. The FUSE_ATTR_DAX flag in FUSE_LOOKUP reply is used to indicate whether DAX should be enabled or not for corresponding file. Currently the state whether DAX is enabled or not for the file is initialized only when inode is instantiated. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 12 ++++++++---- fs/fuse/file.c | 4 ++-- fs/fuse/fuse_i.h | 4 ++-- fs/fuse/inode.c | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index d2bc5e7f5132..ff17bc7a0b6e 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1330,7 +1330,7 @@ static const struct address_space_operations fuse_dax_file_aops = { .invalidatepage = noop_invalidatepage, }; -static bool fuse_should_enable_dax(struct inode *inode) +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); enum fuse_dax_mode dax_mode = fc->dax_mode; @@ -1345,12 +1345,16 @@ static bool fuse_should_enable_dax(struct inode *inode) if (!fc->dax) return false; - return true; + if (dax_mode == FUSE_DAX_ALWAYS) + return true; + + /* dax_mode is FUSE_DAX_INODE* */ + return flags & FUSE_ATTR_DAX; } -void fuse_dax_inode_init(struct inode *inode) +void fuse_dax_inode_init(struct inode *inode, unsigned int flags) { - if (!fuse_should_enable_dax(inode)) + if (!fuse_should_enable_dax(inode, flags)) return; inode->i_flags |= S_DAX; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index df81768c81a7..829094451774 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3169,7 +3169,7 @@ static const struct address_space_operations fuse_file_aops = { .write_end = fuse_write_end, }; -void fuse_init_file_inode(struct inode *inode) +void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -3183,5 +3183,5 @@ void fuse_init_file_inode(struct inode *inode) fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) - fuse_dax_inode_init(inode); + fuse_dax_inode_init(inode, flags); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8bf133109170..ccbc4de7870b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1025,7 +1025,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, /** * Initialize file operations on a regular file */ -void fuse_init_file_inode(struct inode *inode); +void fuse_init_file_inode(struct inode *inode, unsigned int flags); /** * Initialize inode operations on regular files and special files @@ -1291,7 +1291,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); -void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_init(struct inode *inode, unsigned int flags); void fuse_dax_inode_cleanup(struct inode *inode); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d83589ef11c0..dc6714350f21 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -313,7 +313,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); - fuse_init_file_inode(inode); + fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) From 2ee019fadcca343c3deea6a1767965bdf23fc3d0 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:28 +0800 Subject: [PATCH 08/10] fuse: negotiate per inode DAX in FUSE_INIT Among the FUSE_INIT phase, client shall advertise per inode DAX if it's mounted with "dax=inode". Then server is aware that client is in per inode DAX mode, and will construct per-inode DAX attribute accordingly. Server shall also advertise support for per inode DAX. If server doesn't support it while client is mounted with "dax=inode", client will silently fallback to "dax=never" since "dax=inode" is advisory only. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 2 +- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 13 +++++++++---- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index ff17bc7a0b6e..663270dc6e20 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1349,7 +1349,7 @@ static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) return true; /* dax_mode is FUSE_DAX_INODE* */ - return flags & FUSE_ATTR_DAX; + return fc->inode_dax && (flags & FUSE_ATTR_DAX); } void fuse_dax_inode_init(struct inode *inode, unsigned int flags) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ccbc4de7870b..6d63bd403a43 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -780,6 +780,9 @@ struct fuse_conn { /* Initialize security xattrs when creating a new inode */ unsigned int init_security:1; + /* Does the filesystem support per inode DAX? */ + unsigned int inode_dax:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index dc6714350f21..c256fd82af3c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1171,10 +1171,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } - if (IS_ENABLED(CONFIG_FUSE_DAX) && - flags & FUSE_MAP_ALIGNMENT && - !fuse_dax_check_alignment(fc, arg->map_alignment)) { - ok = false; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; } if (flags & FUSE_HANDLE_KILLPRIV_V2) { fc->handle_killpriv_v2 = 1; @@ -1233,6 +1236,8 @@ void fuse_send_init(struct fuse_mount *fm) #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; From c3cb6f935e322fa183988032e318b293d9e4fe53 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:29 +0800 Subject: [PATCH 09/10] fuse: mark inode DONT_CACHE when per inode DAX hint changes When the per inode DAX hint changes while the file is still *opened*, it is quite complicated and maybe fragile to dynamically change the DAX state. Hence mark the inode and corresponding dentries as DONE_CACHE once the per inode DAX hint changes, so that the inode instance will be evicted and freed as soon as possible once the file is closed and the last reference to the inode is put. And then when the file gets reopened next time, the new instantiated inode will reflect the new DAX state. In summary, when the per inode DAX hint changes for an *opened* file, the DAX state of the file won't be updated until this file is closed and reopened later. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 9 +++++++++ fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 3 +++ 3 files changed, 13 insertions(+) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 663270dc6e20..182b24a14804 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1361,6 +1361,15 @@ void fuse_dax_inode_init(struct inode *inode, unsigned int flags) inode->i_data.a_ops = &fuse_dax_file_aops; } +void fuse_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_inode_dax_mode(fc->dax_mode) && + ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) + d_mark_dontcache(inode); +} + bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6d63bd403a43..e8e59fbdefeb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1296,6 +1296,7 @@ void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); void fuse_dax_inode_init(struct inode *inode, unsigned int flags); void fuse_dax_inode_cleanup(struct inode *inode); +void fuse_dax_dontcache(struct inode *inode, unsigned int flags); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c256fd82af3c..ee846ce371d8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -301,6 +301,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); } static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) From 073c3ab6ae0123601b5378e8f49c7b8ec4625f32 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:30 +0800 Subject: [PATCH 10/10] Documentation/filesystem/dax: DAX on virtiofs Record DAX on virtiofs and the semantic difference with that on ext4 and xfs. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- Documentation/filesystems/dax.rst | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/dax.rst b/Documentation/filesystems/dax.rst index 9a1b8fd9e82b..e3b30429d703 100644 --- a/Documentation/filesystems/dax.rst +++ b/Documentation/filesystems/dax.rst @@ -23,8 +23,8 @@ on it as usual. The `DAX` code currently only supports files with a block size equal to your kernel's `PAGE_SIZE`, so you may need to specify a block size when creating the filesystem. -Currently 3 filesystems support `DAX`: ext2, ext4 and xfs. Enabling `DAX` on them -is different. +Currently 4 filesystems support `DAX`: ext2, ext4, xfs and virtiofs. +Enabling `DAX` on them is different. Enabling DAX on ext2 -------------------- @@ -168,6 +168,22 @@ if the underlying media does not support dax and/or the filesystem is overridden with a mount option. +Enabling DAX on virtiofs +---------------------------- +The semantic of DAX on virtiofs is basically equal to that on ext4 and xfs, +except that when '-o dax=inode' is specified, virtiofs client derives the hint +whether DAX shall be enabled or not from virtiofs server through FUSE protocol, +rather than the persistent `FS_XFLAG_DAX` flag. That is, whether DAX shall be +enabled or not is completely determined by virtiofs server, while virtiofs +server itself may deploy various algorithm making this decision, e.g. depending +on the persistent `FS_XFLAG_DAX` flag on the host. + +It is still supported to set or clear persistent `FS_XFLAG_DAX` flag inside +guest, but it is not guaranteed that DAX will be enabled or disabled for +corresponding file then. Users inside guest still need to call statx(2) and +check the statx flag `STATX_ATTR_DAX` to see if DAX is enabled for this file. + + Implementation Tips for Block Driver Writers --------------------------------------------