diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index f47d05ed0d94..4a25c5eb6f07 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -24,6 +24,10 @@ Available fault injection capabilities injects futex deadlock and uaddr fault errors. +- fail_sunrpc + + injects kernel RPC client and server failures. + - fail_make_request injects disk IO errors on devices permitted by setting @@ -151,6 +155,20 @@ configuration of fault-injection capabilities. default is 'N', setting it to 'Y' will disable failure injections when dealing with private (address space) futexes. +- /sys/kernel/debug/fail_sunrpc/ignore-client-disconnect: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will disable disconnect + injection on the RPC client. + +- /sys/kernel/debug/fail_sunrpc/ignore-server-disconnect: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will disable disconnect + injection on the RPC server. + - /sys/kernel/debug/fail_function/inject: Format: { 'function-name' | '!function-name' | '' } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2de048f80eb8..0ab9756ed235 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -584,7 +584,7 @@ static struct ctl_table nlm_sysctls[] = { .data = &nsm_use_hostnames, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4c10fb5138f1..e10ae2c41279 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -40,12 +40,15 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + int mode = lock_to_openmode(&lock->fl); + + error = nlm_lookup_file(rqstp, &file, lock); + if (error) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 61d3cc2283dc..e9b85d8fd5fe 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -31,6 +31,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -395,28 +396,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock) nlmsvc_put_lockowner(lock->fl.fl_owner); } -static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner; - new->fl_owner = nlmsvc_get_lockowner(nlm_lo); -} - -static void nlmsvc_locks_release_private(struct file_lock *fl) -{ - nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner); -} - -static const struct file_lock_operations nlmsvc_lock_ops = { - .fl_copy_lock = nlmsvc_locks_copy_lock, - .fl_release_private = nlmsvc_locks_release_private, -}; - void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { fl->fl_owner = nlmsvc_find_lockowner(host, pid); - if (fl->fl_owner != NULL) - fl->fl_ops = &nlmsvc_lock_ops; } /* @@ -488,17 +471,24 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; + struct inode *inode = nlmsvc_file_inode(file); int error; + int mode; + int async_block = 0; __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + inode->i_sb->s_id, inode->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, wait); + if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + async_block = wait; + wait = 0; + } + /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -542,7 +532,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); @@ -558,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ if (wait) break; - ret = nlm_lck_denied; + ret = async_block ? nlm_lck_blocked : nlm_lck_denied; goto out; case FILE_LOCK_DEFERRED: if (wait) @@ -595,12 +586,13 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *conflock, struct nlm_cookie *cookie) { int error; + int mode; __be32 ret; struct nlm_lockowner *test_owner; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -613,7 +605,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, /* If there's a conflicting lock, remember to clean up the test lock */ test_owner = (struct nlm_lockowner *)lock->fl.fl_owner; - error = vfs_test_lock(file->f_file, &lock->fl); + mode = lock_to_openmode(&lock->fl); + error = vfs_test_lock(file->f_file[mode], &lock->fl); if (error) { /* We can't currently deal with deferred test requests */ if (error == FILE_LOCK_DEFERRED) @@ -634,7 +627,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + conflock->svid = lock->fl.fl_pid; conflock->fl.fl_type = lock->fl.fl_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; @@ -659,11 +652,11 @@ out: __be32 nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { - int error; + int error = 0; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -672,7 +665,12 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + if (file->f_file[O_RDONLY]) + error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + &lock->fl, NULL); + if (file->f_file[O_WRONLY]) + error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; } @@ -689,10 +687,11 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l { struct nlm_block *block; int status = 0; + int mode; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -704,7 +703,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - vfs_cancel_lock(block->b_file->f_file, + mode = lock_to_openmode(&lock->fl); + vfs_cancel_lock(block->b_file->f_file[mode], &block->b_call->a_args.lock.fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); @@ -788,9 +788,21 @@ nlmsvc_notify_blocked(struct file_lock *fl) printk(KERN_WARNING "lockd: notification for unknown block!\n"); } +static fl_owner_t nlmsvc_get_owner(fl_owner_t owner) +{ + return nlmsvc_get_lockowner(owner); +} + +static void nlmsvc_put_owner(fl_owner_t owner) +{ + nlmsvc_put_lockowner(owner); +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, + .lm_get_owner = nlmsvc_get_owner, + .lm_put_owner = nlmsvc_put_owner, }; /* @@ -809,6 +821,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) { struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; + int mode; int error; loff_t fl_start, fl_end; @@ -834,7 +847,8 @@ nlmsvc_grant_blocked(struct nlm_block *block) lock->fl.fl_flags |= FL_SLEEP; fl_start = lock->fl.fl_start; fl_end = lock->fl.fl_end; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; lock->fl.fl_start = fl_start; lock->fl.fl_end = fl_end; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 4ae4b63b5392..99696d3f6dd6 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -55,6 +55,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, struct nlm_host *host = NULL; struct nlm_file *file = NULL; struct nlm_lock *lock = &argp->lock; + int mode; __be32 error = 0; /* nfsd callbacks must have been installed for this procedure */ @@ -69,13 +70,14 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh)); + error = cast_status(nlm_lookup_file(rqstp, &file, lock)); if (error != 0) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + mode = lock_to_openmode(&lock->fl); + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 028fc152da22..cb3a7512c33e 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = locks_inode(file->f_file); + struct inode *inode = nlmsvc_file_inode(file); dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); @@ -71,56 +71,75 @@ static inline unsigned int file_hash(struct nfs_fh *f) return tmp & (FILE_NRHASH - 1); } +int lock_to_openmode(struct file_lock *lock) +{ + return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY; +} + +/* + * Open the file. Note that if we're reexporting, for example, + * this could block the lockd thread for a while. + * + * We have to make sure we have the right credential to open + * the file. + */ +static __be32 nlm_do_fopen(struct svc_rqst *rqstp, + struct nlm_file *file, int mode) +{ + struct file **fp = &file->f_file[mode]; + __be32 nfserr; + + if (*fp) + return 0; + nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (nfserr) + dprintk("lockd: open failed (error %d)\n", nfserr); + return nfserr; +} + /* * Lookup file info. If it doesn't exist, create a file info struct * and open a (VFS) file for the given inode. - * - * FIXME: - * Note that we open the file O_RDONLY even when creating write locks. - * This is not quite right, but for now, we assume the client performs - * the proper R/W checking. */ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, - struct nfs_fh *f) + struct nlm_lock *lock) { struct nlm_file *file; unsigned int hash; __be32 nfserr; + int mode; - nlm_debug_print_fh("nlm_lookup_file", f); + nlm_debug_print_fh("nlm_lookup_file", &lock->fh); - hash = file_hash(f); + hash = file_hash(&lock->fh); + mode = lock_to_openmode(&lock->fl); /* Lock file table */ mutex_lock(&nlm_file_mutex); hlist_for_each_entry(file, &nlm_files[hash], f_list) - if (!nfs_compare_fh(&file->f_handle, f)) + if (!nfs_compare_fh(&file->f_handle, &lock->fh)) { + mutex_lock(&file->f_mutex); + nfserr = nlm_do_fopen(rqstp, file, mode); + mutex_unlock(&file->f_mutex); goto found; - - nlm_debug_print_fh("creating file for", f); + } + nlm_debug_print_fh("creating file for", &lock->fh); nfserr = nlm_lck_denied_nolocks; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) - goto out_unlock; + goto out_free; - memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh)); mutex_init(&file->f_mutex); INIT_HLIST_NODE(&file->f_list); INIT_LIST_HEAD(&file->f_blocks); - /* Open the file. Note that this must not sleep for too long, else - * we would lock up lockd:-) So no NFS re-exports, folks. - * - * We have to make sure we have the right credential to open - * the file. - */ - if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { - dprintk("lockd: open failed (error %d)\n", nfserr); - goto out_free; - } + nfserr = nlm_do_fopen(rqstp, file, mode); + if (nfserr) + goto out_unlock; hlist_add_head(&file->f_list, &nlm_files[hash]); @@ -128,7 +147,6 @@ found: dprintk("lockd: found file %p (count %d)\n", file, file->f_count); *result = file; file->f_count++; - nfserr = 0; out_unlock: mutex_unlock(&nlm_file_mutex); @@ -148,13 +166,34 @@ nlm_delete_file(struct nlm_file *file) nlm_debug_print_file("closing file", file); if (!hlist_unhashed(&file->f_list)) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); kfree(file); } else { printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); } } +static int nlm_unlock_files(struct nlm_file *file) +{ + struct file_lock lock; + struct file *f; + + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + for (f = file->f_file[0]; f <= file->f_file[1]; f++) { + if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { + pr_warn("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + } + return 0; +} + /* * Loop over all locks on the given file and perform the specified * action. @@ -182,17 +221,10 @@ again: lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host; if (match(lockhost, host)) { - struct file_lock lock = *fl; spin_unlock(&flctx->flc_lock); - lock.fl_type = F_UNLCK; - lock.fl_start = 0; - lock.fl_end = OFFSET_MAX; - if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { - printk("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); + if (nlm_unlock_files(file)) return 1; - } goto again; } } @@ -246,6 +278,15 @@ nlm_file_inuse(struct nlm_file *file) return 0; } +static void nlm_close_files(struct nlm_file *file) +{ + struct file *f; + + for (f = file->f_file[0]; f <= file->f_file[1]; f++) + if (f) + nlmsvc_ops->fclose(f); +} + /* * Loop over all files in the file table. */ @@ -276,7 +317,7 @@ nlm_traverse_files(void *data, nlm_host_match_fn_t match, if (list_empty(&file->f_blocks) && !file->f_locks && !file->f_shares && !file->f_count) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + nlm_close_files(file); kfree(file); } } @@ -410,12 +451,13 @@ nlmsvc_invalidate_all(void) nlm_traverse_files(NULL, nlmsvc_is_client, NULL); } + static int nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == locks_inode(file->f_file)->i_sb; + return sb == nlmsvc_file_inode(file)->i_sb; } /** diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 37a1a88df771..d772c20bbfd1 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -180,5 +180,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR, + EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 514be5d28d70..aa353fd58240 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -806,6 +806,9 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) nfs_inc_stats(inode, NFSIOS_VFSLOCK); + if (fl->fl_flags & FL_RECLAIM) + return -ENOGRACE; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) is_local = 1; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 3f5b3d7b62b7..606fa155c28a 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -25,9 +25,11 @@ * Note: we hold the dentry use count while the file is open. */ static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, + int mode) { __be32 nfserr; + int access; struct svc_fh fh; /* must initialize before using! but maxsize doesn't matter */ @@ -36,7 +38,9 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); fh.fh_export = NULL; - nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access |= NFSD_MAY_LOCK; + nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8313e1dbb5dc..42356416f0a0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2687,9 +2687,9 @@ static void force_expire_client(struct nfs4_client *clp) trace_nfsd_clid_admin_expired(&clp->cl_clientid); - spin_lock(&clp->cl_lock); + spin_lock(&nn->client_lock); clp->cl_time = 0; - spin_unlock(&clp->cl_lock); + spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); @@ -6821,6 +6821,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; + struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6842,6 +6843,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } + sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6887,10 +6889,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; + if (lock->lk_reclaim) + fl_flags |= FL_RECLAIM; + fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -6902,7 +6908,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -7022,8 +7029,7 @@ out: /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to - * vfs_test_lock. (Arguably perhaps test_lock should be done with an - * inode operation.) + * vfs_test_lock. */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { @@ -7038,7 +7044,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct NFSD_MAY_READ)); if (err) goto out; + lock->fl_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + lock->fl_file = NULL; out: fh_unlock(fhp); nfsd_file_put(nf); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 60d7c59e7935..90fcd6178823 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -881,6 +881,7 @@ nfserrno (int errno) { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, }; int i; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index adaec43548d1..538520957a81 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -400,18 +400,16 @@ TRACE_EVENT(nfsd_dirent, TP_STRUCT__entry( __field(u32, fh_hash) __field(u64, ino) - __field(int, len) - __dynamic_array(unsigned char, name, namlen) + __string_len(name, name, namlen) ), TP_fast_assign( __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __entry->len = namlen; - memcpy(__get_str(name), name, namlen); + __assign_str_len(name, name, namlen) ), - TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", - __entry->fh_hash, __entry->ino, - __entry->len, __get_str(name)) + TP_printk("fh_hash=0x%08x ino=%llu name=%s", + __entry->fh_hash, __entry->ino, __get_str(name) + ) ) #include "state.h" @@ -608,7 +606,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __array(unsigned char, addr, sizeof(struct sockaddr_in6)) __field(unsigned long, flavor) __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) - __dynamic_array(char, name, clp->cl_name.len + 1) + __string_len(name, name, clp->cl_name.len) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -618,8 +616,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len); - __get_str(name)[clp->cl_name.len] = '\0'; + __assign_str_len(name, clp->cl_name.data, clp->cl_name.len); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 92e77f92268a..738d564ca4ce 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -244,7 +244,6 @@ out_nfserr: * returned. Otherwise the covered directory is returned. * NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, @@ -826,26 +825,16 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct svc_rqst *rqstp = sd->u.data; struct page **pp = rqstp->rq_next_page; struct page *page = buf->page; - size_t size; - - size = sd->len; if (rqstp->rq_res.page_len == 0) { - get_page(page); - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; + svc_rqst_replace_page(rqstp, page); rqstp->rq_res.page_base = buf->offset; - rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { - get_page(page); - if (*rqstp->rq_next_page) - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; - rqstp->rq_res.page_len += size; - } else - rqstp->rq_res.page_len += size; + svc_rqst_replace_page(rqstp, page); + } + rqstp->rq_res.page_len += sd->len; - return size; + return sd->len; } static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, diff --git a/include/linux/errno.h b/include/linux/errno.h index d73f597a2484..8b0c754bab02 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -31,5 +31,6 @@ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ #define ERECALLCONFLICT 530 /* conflict with recalled state */ +#define ENOGRACE 531 /* NFS file lock reclaim refused */ #endif diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fe848901fcc3..3260fe714846 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -221,6 +221,8 @@ struct export_operations { #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ +#define EXPORT_OP_SYNC_LOCKS (0x20) /* Filesystem can't do + asychronous blocking locks */ unsigned long flags; }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c2bcfb12fef..1c01f9f2b574 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1037,6 +1037,7 @@ static inline struct file *get_file(struct file *f) #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 0520c0cd73f4..3bc9f7410e21 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -27,7 +27,8 @@ struct rpc_task; struct nlmsvc_binding { __be32 (*fopen)(struct svc_rqst *, struct nfs_fh *, - struct file **); + struct file **, + int mode); void (*fclose)(struct file *); }; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 666f5f310a04..c4ae6506b8b3 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -10,6 +10,8 @@ #ifndef LINUX_LOCKD_LOCKD_H #define LINUX_LOCKD_LOCKD_H +/* XXX: a lot of this should really be under fs/lockd. */ + #include #include #include @@ -154,7 +156,8 @@ struct nlm_rqst { struct nlm_file { struct hlist_node f_list; /* linked list */ struct nfs_fh f_handle; /* NFS file handle */ - struct file * f_file; /* VFS file pointer */ + struct file * f_file[2]; /* VFS file pointers, + indexed by O_ flags */ struct nlm_share * f_shares; /* DOS shares */ struct list_head f_blocks; /* blocked locks */ unsigned int f_locks; /* guesstimate # of locks */ @@ -267,6 +270,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); /* * Server-side lock handling */ +int lock_to_openmode(struct file_lock *); __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, int, struct nlm_cookie *, int); @@ -286,7 +290,7 @@ void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); * File handling for the server personality */ __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nfs_fh *); + struct nlm_lock *); void nlm_release_file(struct nlm_file *); void nlmsvc_release_lockowner(struct nlm_lock *); void nlmsvc_mark_resources(struct net *); @@ -301,7 +305,8 @@ int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(file->f_file); + return locks_inode(file->f_file[O_RDONLY] ? + file->f_file[O_RDONLY] : file->f_file[O_WRONLY]); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 938c2bf29db8..02117ed0fa2e 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -20,6 +20,7 @@ enum rpc_auth_flavors { RPC_AUTH_DES = 3, RPC_AUTH_KRB = 4, RPC_AUTH_GSS = 6, + RPC_AUTH_TLS = 7, RPC_AUTH_MAXFLAVOR = 8, /* pseudoflavors: */ RPC_AUTH_GSS_KRB5 = 390003, diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e91d51ea028b..f0f846fa396e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -19,6 +19,7 @@ #include #include #include +#include /* statistics for svc_pool structures */ struct svc_pool_stats { @@ -256,6 +257,7 @@ struct svc_rqst { struct page * *rq_next_page; /* next reply page to use */ struct page * *rq_page_end; /* one past the last page */ + struct pagevec rq_pvec; struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; @@ -502,6 +504,8 @@ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node); +void svc_rqst_replace_page(struct svc_rqst *rqstp, + struct page *page); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); unsigned int svc_pool_map_get(void); @@ -523,6 +527,7 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); +const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3184465de3a0..24aa159d29a7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -90,9 +90,9 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; spinlock_t sc_send_lock; - struct list_head sc_send_ctxts; + struct llist_head sc_send_ctxts; spinlock_t sc_rw_ctxt_lock; - struct list_head sc_rw_ctxts; + struct llist_head sc_rw_ctxts; u32 sc_pending_recvs; u32 sc_recv_batch; @@ -150,7 +150,7 @@ struct svc_rdma_recv_ctxt { }; struct svc_rdma_send_ctxt { - struct list_head sc_list; + struct llist_node sc_node; struct rpc_rdma_cid sc_cid; struct ib_send_wr sc_send_wr; @@ -207,6 +207,7 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, struct svc_rdma_recv_ctxt *rctxt, int status); +extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index a965cbc136ad..b519609af1d0 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -95,6 +95,7 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) #define rpc_auth_unix cpu_to_be32(RPC_AUTH_UNIX) #define rpc_auth_short cpu_to_be32(RPC_AUTH_SHORT) #define rpc_auth_gss cpu_to_be32(RPC_AUTH_GSS) +#define rpc_auth_tls cpu_to_be32(RPC_AUTH_TLS) #define rpc_call cpu_to_be32(RPC_CALL) #define rpc_reply cpu_to_be32(RPC_REPLY) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c8c39f22d3b1..b15c1f07162d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -288,7 +288,6 @@ struct rpc_xprt { const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *debugfs; /* debugfs directory */ - atomic_t inject_disconnect; #endif struct rcu_head rcu; const struct xprt_class *xprt_class; @@ -502,21 +501,4 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) return test_and_set_bit(XPRT_BINDING, &xprt->state); } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -extern unsigned int rpc_inject_disconnect; -static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) -{ - if (!rpc_inject_disconnect) - return; - if (atomic_dec_return(&xprt->inject_disconnect)) - return; - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); - xprt->ops->inject_disconnect(xprt); -} -#else -static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) -{ -} -#endif - #endif /* _LINUX_SUNRPC_XPRT_H */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d99ca99837de..1fa2b69c6fc3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -48,6 +48,8 @@ typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 861f199896c6..d323f5a049c8 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1642,7 +1642,7 @@ TRACE_EVENT(svc_process, __field(u32, vers) __field(u32, proc) __string(service, name) - __string(procedure, rqst->rq_procinfo->pc_name) + __string(procedure, svc_proc_name(rqst)) __string(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)") ), @@ -1652,7 +1652,7 @@ TRACE_EVENT(svc_process, __entry->vers = rqst->rq_vers; __entry->proc = rqst->rq_proc; __assign_str(service, name); - __assign_str(procedure, rqst->rq_procinfo->pc_name); + __assign_str(procedure, svc_proc_name(rqst)); __assign_str(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)"); ), @@ -1918,7 +1918,7 @@ TRACE_EVENT(svc_stats_latency, TP_STRUCT__entry( __field(u32, xid) __field(unsigned long, execute) - __string(procedure, rqst->rq_procinfo->pc_name) + __string(procedure, svc_proc_name(rqst)) __string(addr, rqst->rq_xprt->xpt_remotebuf) ), @@ -1926,7 +1926,7 @@ TRACE_EVENT(svc_stats_latency, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->execute = ktime_to_us(ktime_sub(ktime_get(), rqst->rq_stime)); - __assign_str(procedure, rqst->rq_procinfo->pc_name); + __assign_str(procedure, svc_proc_name(rqst)); __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index acc17194c160..08810a463880 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -102,6 +102,9 @@ TRACE_MAKE_SYSTEM_STR(); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) @@ -197,6 +200,9 @@ TRACE_MAKE_SYSTEM_STR(); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) @@ -459,6 +465,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) @@ -507,6 +516,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ #define __string(item, src) __dynamic_array(char, item, \ strlen((src) ? (const char *)(src) : "(null)") + 1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1) + /* * __bitmask_size_in_bytes_raw is the number of bytes needed to hold * num_possible_cpus(). @@ -670,10 +682,20 @@ static inline notrace int trace_event_get_offsets_##call( \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __assign_str #define __assign_str(dst, src) \ strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)"); +#undef __assign_str_len +#define __assign_str_len(dst, src, len) \ + do { \ + memcpy(__get_str(dst), (src), (len)); \ + __get_str(dst)[len] = '\0'; \ + } while(0) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h index 427294dd56a1..e29e8accc4f4 100644 --- a/include/uapi/linux/nfsd/nfsfh.h +++ b/include/uapi/linux/nfsd/nfsfh.h @@ -33,7 +33,6 @@ struct nfs_fhbase_old { /* * This is the new flexible, extensible style NFSv2/v3/v4 file handle. - * by Neil Brown - March 2000 * * The file handle starts with a sequence of four-byte words. * The first word contains a version number (1) and three descriptor bytes diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..25e49b4d8049 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -536,6 +536,21 @@ static void proc_put_char(void **buf, size_t *size, char c) } } +static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *(bool *)valp = *lvalp; + } else { + int val = *(bool *)valp; + + *lvalp = (unsigned long)val; + *negp = false; + } + return 0; +} + static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -798,6 +813,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +/** + * proc_dobool - read/write a bool + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dobool_conv, NULL); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1630,6 +1665,12 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dobool(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3425,6 +3466,7 @@ int __init sysctl_init(void) * No sense putting this after each symbol definition, twice, * exception granted :-) */ +EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 73604bff3d2c..12b805dabbc9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1945,6 +1945,13 @@ config FAIL_MMC_REQUEST and to test how the mmc host driver handles retries from the block device. +config FAIL_SUNRPC + bool "Fault-injection capability for SunRPC" + depends on FAULT_INJECTION_DEBUG_FS && SUNRPC_DEBUG + help + Provide fault-injection capability for SunRPC and + its consumers. + config FAULT_INJECTION_STACKTRACE_FILTER bool "stacktrace filter for fault-injection capabilities" depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index a81be45f40d9..3d685fe328fa 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1980,7 +1980,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 56029e3af6ff..827bf3a28178 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -8,14 +8,14 @@ #include #include #include + #include "netns.h" +#include "fail.h" static struct dentry *topdir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; -unsigned int rpc_inject_disconnect; - static int tasks_show(struct seq_file *f, void *v) { @@ -235,8 +235,6 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) /* make tasks file */ debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops); - - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -246,56 +244,30 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } -static int -fault_open(struct inode *inode, struct file *filp) -{ - filp->private_data = kmalloc(128, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - return 0; -} - -static int -fault_release(struct inode *inode, struct file *filp) -{ - kfree(filp->private_data); - return 0; -} - -static ssize_t -fault_disconnect_read(struct file *filp, char __user *user_buf, - size_t len, loff_t *offset) -{ - char *buffer = (char *)filp->private_data; - size_t size; - - size = sprintf(buffer, "%u\n", rpc_inject_disconnect); - return simple_read_from_buffer(user_buf, len, offset, buffer, size); -} - -static ssize_t -fault_disconnect_write(struct file *filp, const char __user *user_buf, - size_t len, loff_t *offset) -{ - char buffer[16]; - - if (len >= sizeof(buffer)) - len = sizeof(buffer) - 1; - if (copy_from_user(buffer, user_buf, len)) - return -EFAULT; - buffer[len] = '\0'; - if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) - return -EINVAL; - return len; -} - -static const struct file_operations fault_disconnect_fops = { - .owner = THIS_MODULE, - .open = fault_open, - .read = fault_disconnect_read, - .write = fault_disconnect_write, - .release = fault_release, +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +struct fail_sunrpc_attr fail_sunrpc = { + .attr = FAULT_ATTR_INITIALIZER, }; +EXPORT_SYMBOL_GPL(fail_sunrpc); + +static void fail_sunrpc_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_sunrpc", NULL, + &fail_sunrpc.attr); + + debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_client_disconnect); + + debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_server_disconnect); +} +#else +static void fail_sunrpc_init(void) +{ +} +#endif void __exit sunrpc_debugfs_exit(void) @@ -309,16 +281,11 @@ sunrpc_debugfs_exit(void) void __init sunrpc_debugfs_init(void) { - struct dentry *rpc_fault_dir; - topdir = debugfs_create_dir("sunrpc", NULL); rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir); - rpc_fault_dir = debugfs_create_dir("inject_fault", topdir); - - debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL, - &fault_disconnect_fops); + fail_sunrpc_init(); } diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h new file mode 100644 index 000000000000..69dc30cc44b8 --- /dev/null +++ b/net/sunrpc/fail.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021, Oracle. All rights reserved. + */ + +#ifndef _NET_SUNRPC_FAIL_H_ +#define _NET_SUNRPC_FAIL_H_ + +#include + +#if IS_ENABLED(CONFIG_FAULT_INJECTION) + +struct fail_sunrpc_attr { + struct fault_attr attr; + + bool ignore_client_disconnect; + + bool ignore_server_disconnect; +}; + +extern struct fail_sunrpc_attr fail_sunrpc; + +#endif /* CONFIG_FAULT_INJECTION */ + +#endif /* _NET_SUNRPC_FAIL_H_ */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0de918cb3d90..bfcbaf7b3822 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -31,6 +31,8 @@ #include +#include "fail.h" + #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); @@ -838,6 +840,27 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser } EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); +/** + * svc_rqst_replace_page - Replace one page in rq_pages[] + * @rqstp: svc_rqst with pages to replace + * @page: replacement page + * + * When replacing a page in rq_pages, batch the release of the + * replaced pages to avoid hammering the page allocator. + */ +void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +{ + if (*rqstp->rq_next_page) { + if (!pagevec_space(&rqstp->rq_pvec)) + __pagevec_release(&rqstp->rq_pvec); + pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + } + + get_page(page); + *(rqstp->rq_next_page++) = page; +} +EXPORT_SYMBOL_GPL(svc_rqst_replace_page); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. @@ -1503,6 +1526,12 @@ svc_process(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; u32 dir; +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) + if (!fail_sunrpc.ignore_server_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + svc_xprt_deferred_close(rqstp->rq_xprt); +#endif + /* * Setup response xdr_buf. * Initially it has just one page @@ -1629,6 +1658,21 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_max_payload); +/** + * svc_proc_name - Return RPC procedure name in string form + * @rqstp: svc_rqst to operate on + * + * Return value: + * Pointer to a NUL-terminated string + */ +const char *svc_proc_name(const struct svc_rqst *rqstp) +{ + if (rqstp && rqstp->rq_procinfo) + return rqstp->rq_procinfo->pc_name; + return "unknown"; +} + + /** * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index dbb41821b1b8..e1153cba9cc6 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -539,6 +539,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; + pagevec_release(&rqstp->rq_pvec); svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -664,6 +665,8 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) struct xdr_buf *arg = &rqstp->rq_arg; unsigned long pages, filled; + pagevec_init(&rqstp->rq_pvec); + pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index fb6db09725c7..05abe344a269 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -56,6 +56,7 @@ #include "sunrpc.h" #include "sysfs.h" +#include "fail.h" /* * Local variables @@ -855,6 +856,19 @@ xprt_init_autodisconnect(struct timer_list *t) queue_work(xprtiod_workqueue, &xprt->task_cleanup); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!fail_sunrpc.ignore_client_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 1e651447dc4e..e27433f08ca7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -35,6 +35,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); * controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { + struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; @@ -53,19 +54,19 @@ static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_rw_ctxt_lock); - - ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); - if (ctxt) { - list_del(&ctxt->rw_list); - spin_unlock(&rdma->sc_rw_ctxt_lock); + node = llist_del_first(&rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); + if (node) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - spin_unlock(&rdma->sc_rw_ctxt_lock); ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) goto out_noctx; + INIT_LIST_HEAD(&ctxt->rw_list); } @@ -83,14 +84,18 @@ out_noctx: return NULL; } +static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) +{ + sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + llist_add(&ctxt->rw_node, list); +} + static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt) { - sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); - - spin_lock(&rdma->sc_rw_ctxt_lock); - list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); - spin_unlock(&rdma->sc_rw_ctxt_lock); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); } /** @@ -101,9 +106,10 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { - list_del(&ctxt->rw_list); + while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); kfree(ctxt); } } @@ -171,20 +177,35 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, cc->cc_sqecount = 0; } +/* + * The consumed rw_ctx's are cleaned and placed on a local llist so + * that only one atomic llist operation is needed to put them all + * back on the free list. + */ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; + struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; + LLIST_HEAD(free); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - svc_rdma_put_rw_ctxt(rdma, ctxt); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + + ctxt->rw_node.next = first; + first = &ctxt->rw_node; + if (!last) + last = first; } + if (first) + llist_add_batch(first, last, &rdma->sc_rw_ctxts); } /* State for sending a Write or Reply chunk. @@ -248,8 +269,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); @@ -304,9 +324,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); cc->cc_status = wc->status; complete(&cc->cc_done); return; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d6bbafb773e1..599021b2391d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -113,13 +113,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static inline struct svc_rdma_send_ctxt * -svc_rdma_next_send_ctxt(struct list_head *list) -{ - return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, - sc_list); -} - static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, struct rpc_rdma_cid *cid) { @@ -182,9 +175,10 @@ fail0: void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { - list_del(&ctxt->sc_list); + while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); ib_dma_unmap_single(rdma->sc_pd->device, ctxt->sc_sges[0].addr, rdma->sc_max_req_size, @@ -204,12 +198,13 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_send_lock); - ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_send_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->sc_list); + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); spin_unlock(&rdma->sc_send_lock); out: @@ -253,9 +248,21 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, ctxt->sc_sges[i].length); } - spin_lock(&rdma->sc_send_lock); - list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); - spin_unlock(&rdma->sc_send_lock); + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); +} + +/** + * svc_rdma_wake_send_waiters - manage Send Queue accounting + * @rdma: controlling transport + * @avail: Number of additional SQEs that are now available + * + */ +void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) +{ + atomic_add(avail, &rdma->sc_sq_avail); + smp_mb__after_atomic(); + if (unlikely(waitqueue_active(&rdma->sc_send_wait))) + wake_up(&rdma->sc_send_wait); } /** @@ -275,11 +282,9 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + svc_rdma_wake_send_waiters(rdma, 1); complete(&ctxt->sc_done); - atomic_inc(&rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d94b7759ada1..94b20fb47135 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -136,9 +136,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); - INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -545,7 +545,6 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - struct svc_xprt *xprt = &rdma->sc_xprt; /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -553,12 +552,6 @@ static void __svc_rdma_free(struct work_struct *work) svc_rdma_flush_recv_queues(rdma); - /* Final put of backchannel client transport */ - if (xprt->xpt_bc_xprt) { - xprt_put(xprt->xpt_bc_xprt); - xprt->xpt_bc_xprt = NULL; - } - svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 13a35f7cbe66..e61471ab7d14 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -141,6 +141,33 @@ * In most cases, the __assign_str() macro will take the same * parameters as the __string() macro had to declare the string. * + * __string_len: This is a helper to a __dynamic_array, but it understands + * that the array has characters in it, and with the combined + * use of __assign_str_len(), it will allocate 'len' + 1 bytes + * in the ring buffer and add a '\0' to the string. This is + * useful if the string being saved has no terminating '\0' byte. + * It requires that the length of the string is known as it acts + * like a memcpy(). + * + * Declared with: + * + * __string_len(foo, bar, len) + * + * To assign this string, use the helper macro __assign_str_len(). + * + * __assign_str(foo, bar, len); + * + * Then len + 1 is allocated to the ring buffer, and a nul terminating + * byte is added. This is similar to: + * + * memcpy(__get_str(foo), bar, len); + * __get_str(foo)[len] = 0; + * + * The advantage of using this over __dynamic_array, is that it + * takes care of allocating the extra byte on the ring buffer + * for the '\0' terminating byte, and __get_str(foo) can be used + * in the TP_printk(). + * * __bitmask: This is another kind of __dynamic_array, but it expects * an array of longs, and the number of bits to parse. It takes * two parameters (name, nr_bits), where name is the name of the