From 3c9101a0575f6078191dae6bb91a834c6aac3557 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 20 Oct 2010 00:17:53 -0400 Subject: [PATCH 01/15] NFSD: remove duplicate NFS4_STATEID_SIZE Already accepted by Bruce Signed-off-by: Andy Adamson Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfsd/nfs4callback.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 988cbb3a19b6..014482c4e57d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -41,7 +41,6 @@ #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 -#define NFS4_STATEID_SIZE 16 /* Index of predefined Linux callback client operations */ From 35b61e63323ccf5fdcdd74b11751b58392c9cce1 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 20 Oct 2010 00:17:54 -0400 Subject: [PATCH 02/15] SUNRPC: define xdr_decode_opaque_fixed A helper for decoding a fixed length opaque value. Returns a pointer to the next item in the xdr stream. Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index ab91d86565fd..498ab93a81e4 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -132,6 +132,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } +static inline __be32 * +xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) +{ + memcpy(ptr, p, len); + return p + XDR_QUADLEN(len); +} + /* * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) */ From c772567d97fa0fca454eea68aeae915ca1bc732b Mon Sep 17 00:00:00 2001 From: Dean Hildebrand Date: Wed, 20 Oct 2010 00:17:55 -0400 Subject: [PATCH 03/15] NFSv4.1: pnfsd, pnfs: protocol level pnfs constants Use only layoutreturn constant for both returns and recalls. (return_* works better for recall_type rather the other way around) Signed-off-by: Dean Hildebrand Signed-off-by: Marc Eshel Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 07e40c625972..6c0406e87d5c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -471,6 +471,8 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) +#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) +#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define NFSPROC4_NULL 0 #define NFSPROC4_COMPOUND 1 @@ -550,6 +552,49 @@ enum state_protect_how4 { SP4_SSV = 2 }; +enum pnfs_layouttype { + LAYOUT_NFSV4_1_FILES = 1, + LAYOUT_OSD2_OBJECTS = 2, + LAYOUT_BLOCK_VOLUME = 3, +}; + +/* used for both layout return and recall */ +enum pnfs_layoutreturn_type { + RETURN_FILE = 1, + RETURN_FSID = 2, + RETURN_ALL = 3 +}; + +enum pnfs_iomode { + IOMODE_READ = 1, + IOMODE_RW = 2, + IOMODE_ANY = 3, +}; + +enum pnfs_notify_deviceid_type4 { + NOTIFY_DEVICEID4_CHANGE = 1 << 1, + NOTIFY_DEVICEID4_DELETE = 1 << 2, +}; + +#define NFL4_UFLG_MASK 0x0000003F +#define NFL4_UFLG_DENSE 0x00000001 +#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 +#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 + +/* Encoded in the loh_body field of type layouthint4 */ +enum filelayout_hint_care4 { + NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, + NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, + NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, + NFLH4_CARE_STRIPE_COUNT = 0x00000080 +}; + +#define NFS4_DEVICEID4_SIZE 16 + +struct nfs4_deviceid { + char data[NFS4_DEVICEID4_SIZE]; +}; + #endif #endif From 9449925273933d19235d7d36c1fd970841d055de Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis Date: Wed, 20 Oct 2010 00:17:56 -0400 Subject: [PATCH 04/15] NFS: change stateid to be a union In NFSv4.1 the stateid consists of the other and seqid fields. For layout processing we need to numerically compare the seqid value of layout stateids. To do so, introduce a union to nfs4_stateid to switch between opaque(16 bytes) and opaque(12 bytes) / __be32 Signed-off-by: Alexandros Batsakis Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 8 ++++---- include/linux/nfs4.h | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 930d10fecdaf..2950fca0c61b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n if (delegation == NULL) return 0; - /* seqid is 4-bytes long */ - if (((u32 *) &stateid->data)[0] != 0) + if (stateid->stateid.seqid != 0) return 0; - if (memcmp(&delegation->stateid.data[4], &stateid->data[4], - sizeof(stateid->data)-4)) + if (memcmp(&delegation->stateid.stateid.other, + &stateid->stateid.other, + NFS4_STATEID_OTHER_SIZE)) return 0; return 1; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 6c0406e87d5c..34da32436ac0 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -17,7 +17,9 @@ #define NFS4_BITMAP_SIZE 2 #define NFS4_VERIFIER_SIZE 8 -#define NFS4_STATEID_SIZE 16 +#define NFS4_STATEID_SEQID_SIZE 4 +#define NFS4_STATEID_OTHER_SIZE 12 +#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) #define NFS4_FHSIZE 128 #define NFS4_MAXPATHLEN PATH_MAX #define NFS4_MAXNAMLEN NAME_MAX @@ -167,7 +169,16 @@ struct nfs4_acl { }; typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; -typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; + +struct nfs41_stateid { + __be32 seqid; + char other[NFS4_STATEID_OTHER_SIZE]; +} __attribute__ ((packed)); + +typedef union { + char data[NFS4_STATEID_SIZE]; + struct nfs41_stateid stateid; +} nfs4_stateid; enum nfs_opnum4 { OP_ACCESS = 3, From 504913fbc84c00bba7224d73e4aab525c1731f7d Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 20 Oct 2010 00:17:57 -0400 Subject: [PATCH 05/15] NFS: ask for layouttypes during v4 fsinfo call This information will be used to determine which layout driver, if any, to use for subsequent IO on this filesystem. Each driver is assigned an integer id, with 0 reserved to indicate no driver. The server can in theory return multiple ids. However, our current client implementation only notes the first entry and ignores the rest. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + fs/nfs/nfs4xdr.c | 58 +++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_xdr.h | 1 + 3 files changed, 60 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e87fe612ca18..a5f1edb45b47 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -130,6 +130,7 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES }; const u32 nfs4_fs_locations_bitmap[2] = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index bd2101d918c8..8b4dfa393f0f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3978,6 +3978,61 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); } +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + uint32_t *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " + "per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. + */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { __be32 *savep; @@ -4004,6 +4059,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) goto xdr_error; fsinfo->wtpref = fsinfo->wtmax; status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); if (status != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index da7a1300dc60..065f9d105d05 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -114,6 +114,7 @@ struct nfs_fsinfo { __u64 maxfilesize; struct timespec time_delta; /* server time granularity */ __u32 lease_time; /* in seconds */ + __u32 layouttype; /* supported pnfs layout driver */ }; struct nfs_fsstat { From 85e174ba6b786ad336eb2df105b4f66d0932e70a Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Wed, 20 Oct 2010 00:17:58 -0400 Subject: [PATCH 06/15] NFS: set layout driver Put in the infrastructure that uses information returned from the server at mount to select a layout driver module. In this patch, a stub is used that always returns "no driver found". Signed-off-by: Ricardo Labiaga Signed-off-by: Dean Hildebrand Signed-off-by: Marc Eshel Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 1 + fs/nfs/client.c | 5 +++ fs/nfs/pnfs.c | 84 +++++++++++++++++++++++++++++++++++++++ fs/nfs/pnfs.h | 56 ++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 1 + 6 files changed, 148 insertions(+) create mode 100644 fs/nfs/pnfs.c create mode 100644 fs/nfs/pnfs.h diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index da7fda639eac..bb9e773d4312 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,5 +15,6 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a63bce8d0596..eba0bcc1bab0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -48,6 +48,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -900,6 +901,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + set_pnfs_layoutdriver(server, fsinfo->layouttype); + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); @@ -939,6 +942,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } fsinfo.fattr = fattr; + fsinfo.layouttype = 0; error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1021,6 +1025,7 @@ void nfs_free_server(struct nfs_server *server) { dprintk("--> nfs_free_server()\n"); + unset_pnfs_layoutdriver(server); spin_lock(&nfs_client_lock); list_del(&server->client_link); list_del(&server->master_link); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c new file mode 100644 index 000000000000..b483026e82aa --- /dev/null +++ b/fs/nfs/pnfs.c @@ -0,0 +1,84 @@ +/* + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* STUB that returns the equivalent of "no module found" */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + return NULL; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, + id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + server->pnfs_curr_ld = ld_type; + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h new file mode 100644 index 000000000000..c628ef131d83 --- /dev/null +++ b/fs/nfs/pnfs.h @@ -0,0 +1,56 @@ +/* + * pNFS client data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { +}; + +void set_pnfs_layoutdriver(struct nfs_server *, u32 id); +void unset_pnfs_layoutdriver(struct nfs_server *); + +#else /* CONFIG_NFS_V4_1 */ + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d929b1883644..aba3da2a6227 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -615,6 +615,7 @@ nfs_fileid_to_ino_t(u64 fileid) #define NFSDBG_CLIENT 0x0200 #define NFSDBG_MOUNT 0x0400 #define NFSDBG_FSCACHE 0x0800 +#define NFSDBG_PNFS 0x1000 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 5eef862ec187..c38619d95a57 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -145,6 +145,7 @@ struct nfs_server { u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ + struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ #endif void (*destroy)(struct nfs_server *); From 02c35fca7cf4ea2dfdc6db279e230cacbbf4b870 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 20 Oct 2010 00:17:59 -0400 Subject: [PATCH 07/15] NFSv4.1: pnfs: full mount/umount infrastructure Allow a module implementing a layout type to register, and have its mount/umount routines called for filesystems that the server declares support it. Signed-off-by: Fred Isaman Signed-off-by: Marc Eshel Signed-off-by: Andy Adamson Signed-off-by: Bian Naimeng Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/00-INDEX | 2 + Documentation/filesystems/nfs/pnfs.txt | 48 ++++++++++++++ fs/nfs/Kconfig | 8 ++- fs/nfs/pnfs.c | 88 +++++++++++++++++++++++++- fs/nfs/pnfs.h | 9 +++ 5 files changed, 151 insertions(+), 4 deletions(-) create mode 100644 Documentation/filesystems/nfs/pnfs.txt diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index 3225a5662114..a57e12411d2a 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX @@ -12,6 +12,8 @@ nfs-rdma.txt - how to install and setup the Linux NFS/RDMA client and server software nfsroot.txt - short guide on setting up a diskless box with NFS root filesystem. +pnfs.txt + - short explanation of some of the internals of the pnfs client code rpc-cache.txt - introduction to the caching mechanisms in the sunrpc layer. idmapper.txt diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt new file mode 100644 index 000000000000..bc0b9cfe095b --- /dev/null +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -0,0 +1,48 @@ +Reference counting in pnfs: +========================== + +The are several inter-related caches. We have layouts which can +reference multiple devices, each of which can reference multiple data servers. +Each data server can be referenced by multiple devices. Each device +can be referenced by multiple layouts. To keep all of this straight, +we need to reference count. + + +struct pnfs_layout_hdr +---------------------- +The on-the-wire command LAYOUTGET corresponds to struct +pnfs_layout_segment, usually referred to by the variable name lseg. +Each nfs_inode may hold a pointer to a cache of of these layout +segments in nfsi->layout, of type struct pnfs_layout_hdr. + +We reference the header for the inode pointing to it, across each +outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN, +LAYOUTCOMMIT), and for each lseg held within. + +Each header is also (when non-empty) put on a list associated with +struct nfs_client (cl_layouts). Being put on this list does not bump +the reference count, as the layout is kept around by the lseg that +keeps it in the list. + +deviceid_cache +-------------- +lsegs reference device ids, which are resolved per nfs_client and +layout driver type. The device ids are held in a RCU cache (struct +nfs4_deviceid_cache). The cache itself is referenced across each +mount. The entries (struct nfs4_deviceid) themselves are held across +the lifetime of each lseg referencing them. + +RCU is used because the deviceid is basically a write once, read many +data structure. The hlist size of 32 buckets needs better +justification, but seems reasonable given that we can have multiple +deviceid's per filesystem, and multiple filesystems per nfs_client. + +The hash code is copied from the nfsd code base. A discussion of +hashing and variations of this algorithm can be found at: +http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809 + +data server cache +----------------- +file driver devices refer to data servers, which are kept in a module +level cache. Its reference is held over the lifetime of the deviceid +pointing to it. diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 3f69752d6f18..d94311943ec6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -75,13 +75,17 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_V4 && EXPERIMENTAL + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol - (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. + (RFC 5661) in the kernel's NFS client. If unsure, say N. +config PNFS_FILE_LAYOUT + tristate + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b483026e82aa..cf795625610e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -32,16 +32,51 @@ #define NFSDBG_FACILITY NFSDBG_PNFS -/* STUB that returns the equivalent of "no module found" */ +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. + */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + static struct pnfs_layoutdriver_type * find_pnfs_driver(u32 id) { - return NULL; + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + spin_unlock(&pnfs_spinlock); + return local; } void unset_pnfs_layoutdriver(struct nfs_server *nfss) { + if (nfss->pnfs_curr_ld) { + nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss); + module_put(nfss->pnfs_curr_ld->owner); + } nfss->pnfs_curr_ld = NULL; } @@ -74,7 +109,18 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } } + if (!try_module_get(ld_type->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + goto out_no_driver; + } server->pnfs_curr_ld = ld_type; + if (ld_type->initialize_mountpoint(server)) { + printk(KERN_ERR + "%s: Error initializing mount point for layout driver %u.\n", + __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -82,3 +128,41 @@ out_no_driver: dprintk("%s: Using NFSv4 I/O\n", __func__); server->pnfs_curr_ld = NULL; } + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "%s id 0 is reserved\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "%s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index c628ef131d83..61531f338576 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -36,8 +36,17 @@ /* Per-layout driver specific registration structure */ struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + int (*initialize_mountpoint) (struct nfs_server *); + int (*uninitialize_mountpoint) (struct nfs_server *); }; +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); From 7ab672ce312133ee4a5d85b71447b2b334403681 Mon Sep 17 00:00:00 2001 From: Dean Hildebrand Date: Wed, 20 Oct 2010 00:18:00 -0400 Subject: [PATCH 08/15] NFSv4.1: pnfs: filelayout: introduce minimal file layout driver This driver just registers itself and supplies trivial mount/umount functions. Signed-off-by: Dean Hildebrand Signed-off-by: Marc Eshel Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 3 ++ fs/nfs/nfs4filelayout.c | 78 +++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 3 files changed, 82 insertions(+) create mode 100644 fs/nfs/nfs4filelayout.c diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index bb9e773d4312..08a8889c5149 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -18,3 +18,6 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ nfs-$(CONFIG_NFS_V4_1) += pnfs.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c new file mode 100644 index 000000000000..696e283c2632 --- /dev/null +++ b/fs/nfs/nfs4filelayout.c @@ -0,0 +1,78 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand "); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +int +filelayout_initialize_mountpoint(struct nfs_server *nfss) +{ + return 0; +} + +int +filelayout_uninitialize_mountpoint(struct nfs_server *nfss) +{ + dprintk("--> %s\n", __func__); + + return 0; +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .initialize_mountpoint = filelayout_initialize_mountpoint, + .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index aba3da2a6227..499872fa895c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -616,6 +616,7 @@ nfs_fileid_to_ino_t(u64 fileid) #define NFSDBG_MOUNT 0x0400 #define NFSDBG_FSCACHE 0x0800 #define NFSDBG_PNFS 0x1000 +#define NFSDBG_PNFS_LD 0x2000 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ From e5e940170b2136ad4d5483ef293ae284b9cc8d53 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 20 Oct 2010 00:18:01 -0400 Subject: [PATCH 09/15] NFS: create and destroy inode's layout cache At the start of the io paths, try to grab the relevant layout information. This will initiate the inode's layout cache, but stubs ensure the cache stays empty. Signed-off-by: Benny Halevy Signed-off-by: Dean Hildebrand Signed-off-by: Marc Eshel Signed-off-by: Tao Guo Signed-off-by: Ricardo Labiaga Signed-off-by: Boaz Harrosh Signed-off-by: Andy Adamson Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 5 ++ fs/nfs/inode.c | 3 + fs/nfs/pnfs.c | 140 +++++++++++++++++++++++++++++++++++++++++ fs/nfs/pnfs.h | 39 ++++++++++++ fs/nfs/read.c | 3 + include/linux/nfs_fs.h | 3 + 6 files changed, 193 insertions(+) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c3f2477c16c1..91d019d39122 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + pnfs_update_layout(mapping->host, + nfs_file_open_context(file), + IOMODE_RW); + start: /* * Prevent starvation issues if someone is doing a consistency diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6eec28656415..314f57164602 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" #include "dns_resolve.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -1410,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); + pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); /* First call standard NFS clear_inode() code */ @@ -1447,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation = NULL; nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; #endif } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cf795625610e..c0cd954855b9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -166,3 +166,143 @@ pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) spin_unlock(&pnfs_spinlock); } EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +static void +get_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + assert_spin_locked(&lo->inode->i_lock); + lo->refcount++; +} + +static void +put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + assert_spin_locked(&lo->inode->i_lock); + BUG_ON(lo->refcount == 0); + + lo->refcount--; + if (!lo->refcount) { + dprintk("%s: freeing layout cache %p\n", __func__, lo); + NFS_I(lo->inode)->layout = NULL; + kfree(lo); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + /* Matched by refcount set to 1 in alloc_init_layout_hdr */ + put_layout_hdr_locked(lo); + } + spin_unlock(&nfsi->vfs_inode.i_lock); +} + +/* STUB - pretend LAYOUTGET to server failed */ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + u32 iomode) +{ + struct inode *ino = lo->inode; + + set_bit(lo_fail_bit(iomode), &lo->state); + spin_lock(&ino->i_lock); + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); + return NULL; +} + +static struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); + if (!lo) + return NULL; + lo->refcount = 1; + lo->inode = ino; + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + assert_spin_locked(&ino->i_lock); + if (nfsi->layout) + return nfsi->layout; + + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) /* Won the race? */ + nfsi->layout = new; + else + kfree(new); + return nfsi->layout; +} + +/* STUB - LAYOUTGET never succeeds, so cache is empty */ +static struct pnfs_layout_segment * +pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) +{ + return NULL; +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. + */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + enum pnfs_iomode iomode) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + return NULL; + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino); + if (lo == NULL) { + dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + goto out_unlock; + } + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_has_layout(lo, iomode); + if (lseg) { + dprintk("%s: Using cached lseg %p for iomode %d)\n", + __func__, lseg, iomode); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) + goto out_unlock; + + get_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); + + lseg = send_layoutget(lo, ctx, iomode); +out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout->state, lseg); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 61531f338576..4ed1b48c71b1 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -34,6 +34,11 @@ #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ +}; + /* Per-layout driver specific registration structure */ struct pnfs_layoutdriver_type { struct list_head pnfs_tblid; @@ -44,14 +49,48 @@ struct pnfs_layoutdriver_type { int (*uninitialize_mountpoint) (struct nfs_server *); }; +struct pnfs_layout_hdr { + unsigned long refcount; + unsigned long state; + struct inode *inode; +}; + extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_destroy_layout(struct nfs_inode *); + + +static inline int lo_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} #else /* CONFIG_NFS_V4_1 */ +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + enum pnfs_iomode access_type) +{ + return NULL; +} + static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) { } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 79859c81a943..e4b62c6f5a6e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -25,6 +25,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -120,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); + pnfs_update_layout(inode, ctx, IOMODE_READ); new = nfs_create_request(ctx, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); @@ -624,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ + pnfs_update_layout(inode, desc.ctx, IOMODE_READ); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 499872fa895c..0833bb67c831 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -188,6 +188,9 @@ struct nfs_inode { struct nfs_delegation *delegation; fmode_t delegation_state; struct rw_semaphore rwsem; + + /* pNFS layout information */ + struct pnfs_layout_hdr *layout; #endif /* CONFIG_NFS_V4*/ #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; From 974cec8ca0352eb5d281535b714cf194a606e98f Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 20 Oct 2010 00:18:02 -0400 Subject: [PATCH 10/15] NFS: client needs to maintain list of inodes with active layouts In particular, server reboot will invalidate all layouts. Note that in order to have an active layout, we must get a successful response from the server. To avoid adding that machinery, this patch just includes a stub that fakes up a successful return. Since the layout is never referenced for io, this is not a problem. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 +- fs/nfs/nfs4state.c | 2 + fs/nfs/pnfs.c | 160 +++++++++++++++++++++++++++++++++++++- fs/nfs/pnfs.h | 14 ++++ include/linux/nfs_fs_sb.h | 1 + 5 files changed, 177 insertions(+), 4 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index eba0bcc1bab0..e55ad211dbed 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -156,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; - +#if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); +#endif nfs_fscache_get_client_cookie(clp); return clp; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e0dc06d74b6a..9c81f4d66950 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -54,6 +54,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #define OPENOWNER_POOL_SIZE 8 @@ -1475,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp) } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + pnfs_destroy_all_layouts(clp); } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c0cd954855b9..891a0c36f992 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -28,6 +28,7 @@ */ #include +#include "internal.h" #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -183,38 +184,189 @@ put_layout_hdr_locked(struct pnfs_layout_hdr *lo) lo->refcount--; if (!lo->refcount) { dprintk("%s: freeing layout cache %p\n", __func__, lo); + BUG_ON(!list_empty(&lo->layouts)); NFS_I(lo->inode)->layout = NULL; kfree(lo); } } +static void +put_layout_hdr(struct inode *inode) +{ + spin_lock(&inode->i_lock); + put_layout_hdr_locked(NFS_I(inode)->layout); + spin_unlock(&inode->i_lock); +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->fi_list); + kref_init(&lseg->kref); + lseg->layout = lo; +} + +/* Called without i_lock held, as the free_lseg call may sleep */ +static void +destroy_lseg(struct kref *kref) +{ + struct pnfs_layout_segment *lseg = + container_of(kref, struct pnfs_layout_segment, kref); + struct inode *ino = lseg->layout->inode; + + dprintk("--> %s\n", __func__); + kfree(lseg); + /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ + put_layout_hdr(ino); +} + +static void +put_lseg(struct pnfs_layout_segment *lseg) +{ + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->kref.refcount)); + kref_put(&lseg->kref, destroy_lseg); +} + +static void +pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) +{ + struct pnfs_layout_segment *lseg, *next; + struct nfs_client *clp; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + assert_spin_locked(&lo->inode->i_lock); + list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { + dprintk("%s: freeing lseg %p\n", __func__, lseg); + list_move(&lseg->fi_list, tmp_list); + } + clp = NFS_SERVER(lo->inode)->nfs_client; + spin_lock(&clp->cl_lock); + /* List does not take a reference, so no need for put here */ + list_del_init(&lo->layouts); + spin_unlock(&clp->cl_lock); + + dprintk("%s:Return\n", __func__); +} + +static void +pnfs_free_lseg_list(struct list_head *tmp_list) +{ + struct pnfs_layout_segment *lseg; + + while (!list_empty(tmp_list)) { + lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, + fi_list); + dprintk("%s calling put_lseg on %p\n", __func__, lseg); + list_del(&lseg->fi_list); + put_lseg(lseg); + } +} + void pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { + pnfs_clear_lseg_list(lo, &tmp_list); /* Matched by refcount set to 1 in alloc_init_layout_hdr */ put_layout_hdr_locked(lo); } spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); } -/* STUB - pretend LAYOUTGET to server failed */ +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&clp->cl_lock); + list_splice_init(&clp->cl_layouts, &tmp_list); + spin_unlock(&clp->cl_lock); + + while (!list_empty(&tmp_list)) { + lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, + layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->inode->i_ino); + pnfs_destroy_layout(NFS_I(lo->inode)); + } +} + +static void pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg); + +/* Get layout from server. */ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, u32 iomode) { struct inode *ino = lo->inode; + struct pnfs_layout_segment *lseg; - set_bit(lo_fail_bit(iomode), &lo->state); + /* Lets pretend we sent LAYOUTGET and got a response */ + lseg = kzalloc(sizeof(*lseg), GFP_KERNEL); + if (!lseg) { + set_bit(lo_fail_bit(iomode), &lo->state); + spin_lock(&ino->i_lock); + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); + return NULL; + } + init_lseg(lo, lseg); + lseg->iomode = IOMODE_RW; spin_lock(&ino->i_lock); + pnfs_insert_layout(lo, lseg); put_layout_hdr_locked(lo); spin_unlock(&ino->i_lock); - return NULL; + return lseg; +} + +static void +pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->inode->i_lock); + if (list_empty(&lo->segs)) { + struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; + + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->layouts)); + list_add_tail(&lo->layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + get_layout_hdr_locked(lo); + /* STUB - add the constructed lseg if necessary */ + if (list_empty(&lo->segs)) { + list_add_tail(&lseg->fi_list, &lo->segs); + dprintk("%s: inserted lseg %p iomode %d at tail\n", + __func__, lseg, lseg->iomode); + } else { + /* There is no harm for the moment in calling this + * with the lock held, and the call will be removed + * with the STUB. + */ + put_lseg(lseg); + } + + dprintk("%s:Return\n", __func__); } static struct pnfs_layout_hdr * @@ -226,6 +378,8 @@ alloc_init_layout_hdr(struct inode *ino) if (!lo) return NULL; lo->refcount = 1; + INIT_LIST_HEAD(&lo->layouts); + INIT_LIST_HEAD(&lo->segs); lo->inode = ino; return lo; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 4ed1b48c71b1..1c3eb02f4944 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,13 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +struct pnfs_layout_segment { + struct list_head fi_list; + u32 iomode; + struct kref kref; + struct pnfs_layout_hdr *layout; +}; + #ifdef CONFIG_NFS_V4_1 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" @@ -51,6 +58,8 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { unsigned long refcount; + struct list_head layouts; /* other client layouts */ + struct list_head segs; /* layout segments list */ unsigned long state; struct inode *inode; }; @@ -64,6 +73,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); static inline int lo_fail_bit(u32 iomode) @@ -80,6 +90,10 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss) #else /* CONFIG_NFS_V4_1 */ +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c38619d95a57..4d62f1581ed1 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -82,6 +82,7 @@ struct nfs_client { /* The flags used for obtaining the clientid during EXCHANGE_ID */ u32 cl_exchange_flags; struct nfs4_session *cl_session; /* sharred session */ + struct list_head cl_layouts; #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_FSCACHE From b1f69b754ee312ec75f2c7ead0e6851cd9598cc2 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 20 Oct 2010 00:18:03 -0400 Subject: [PATCH 11/15] NFSv4.1: pnfs: add LAYOUTGET and GETDEVICEINFO infrastructure Add the ability to actually send LAYOUTGET and GETDEVICEINFO. This also adds in the machinery to handle layout state and the deviceid cache. Note that GETDEVICEINFO is not called directly by the generic layer. Instead it is called by the drivers while parsing the LAYOUTGET opaque data in response to an unknown device id embedded therein. RFC 5661 only encodes device ids within the driver-specific opaque data. Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Marc Eshel Signed-off-by: Mike Sager Signed-off-by: Ricardo Labiaga Signed-off-by: Tao Guo Signed-off-by: Boaz Harrosh Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 142 ++++++++++++++ fs/nfs/nfs4xdr.c | 302 +++++++++++++++++++++++++++++ fs/nfs/pnfs.c | 389 ++++++++++++++++++++++++++++++++++---- fs/nfs/pnfs.h | 73 ++++++- include/linux/nfs4.h | 2 + include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_xdr.h | 49 +++++ 7 files changed, 923 insertions(+), 35 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a5f1edb45b47..7e14e991ddfa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -55,6 +55,7 @@ #include "internal.h" #include "iostat.h" #include "callback.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -5256,6 +5257,147 @@ out: dprintk("<-- %s status=%d\n", __func__, status); return status; } + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *ino = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(ino); + + dprintk("--> %s\n", __func__); + if (nfs4_setup_sequence(server, &lgp->args.seq_args, + &lgp->res.seq_res, 0, task)) + return; + rpc_call_start(task); +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + lgp->status = task->tk_status; + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + put_layout_hdr(lgp->args.inode); + if (lgp->res.layout.buf != NULL) + free_page((unsigned long) lgp->res.layout.buf); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); + if (lgp->res.layout.buf == NULL) { + nfs4_layoutget_release(lgp); + return -ENOMEM; + } + + lgp->res.seq_res.sr_slot = NULL; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = lgp->status; + if (status != 0) + goto out; + status = pnfs_layout_process(lgp); +out: + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server, &msg, &args, &res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8b4dfa393f0f..f313c4cce7e4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include #include "nfs4_fs.h" #include "internal.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1737,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr, #endif /* CONFIG_NFS_V4_1 */ } +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(OP_GETDEVICEINFO); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(0); /* bitmap length 0 */ + hdr->nops++; + hdr->replen += decode_getdeviceinfo_maxsz; +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTGET); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, + args->ctx->state); + p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" ENCODE ROUTINES. */ @@ -2554,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, return 0; } +/* + * Encode GETDEVICEINFO request + */ +static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, + struct nfs4_getdeviceinfo_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(&xdr, args, &hdr); + + /* set up reply kvec. Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); + return 0; +} + +/* + * Encode LAYOUTGET request + */ +static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, + struct nfs4_layoutget_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -4830,6 +4955,134 @@ out_overflow: #endif /* CONFIG_NFS_V4_1 */ } +#if defined(CONFIG_NFS_V4_1) + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + + /* Parse notification bitmap, verifying that it is zero. */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + int i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < len; i++, p++) { + if (be32_to_cpup(p)) { + dprintk("%s: notifications not supported\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p++); + p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 24); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + res->type = be32_to_cpup(p++); + + status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); + if (unlikely(status)) + return status; + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layout.len); + + /* nfs4_proc_layoutget allocated a single page */ + if (res->layout.len > PAGE_SIZE) + return -ENOMEM; + memcpy(res->layout.buf, p, res->layout.len); + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" DECODE ROUTINES. */ @@ -5857,6 +6110,53 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, status = decode_reclaim_complete(&xdr, (void *)NULL); return status; } + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_getdeviceinfo_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(&xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_layoutget_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_layoutget(&xdr, rqstp, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, @@ -6048,6 +6348,8 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 891a0c36f992..d1ad7df3479e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -140,6 +140,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) printk(KERN_ERR "%s id 0 is reserved\n", __func__); return status; } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } spin_lock(&pnfs_spinlock); tmp = find_pnfs_driver_locked(ld_type->id); @@ -168,6 +173,10 @@ pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) } EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); +/* + * pNFS client layout cache + */ + static void get_layout_hdr_locked(struct pnfs_layout_hdr *lo) { @@ -190,7 +199,7 @@ put_layout_hdr_locked(struct pnfs_layout_hdr *lo) } } -static void +void put_layout_hdr(struct inode *inode) { spin_lock(&inode->i_lock); @@ -215,7 +224,7 @@ destroy_lseg(struct kref *kref) struct inode *ino = lseg->layout->inode; dprintk("--> %s\n", __func__); - kfree(lseg); + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ put_layout_hdr(ino); } @@ -249,6 +258,9 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) /* List does not take a reference, so no need for put here */ list_del_init(&lo->layouts); spin_unlock(&clp->cl_lock); + write_seqlock(&lo->seqlock); + clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); + write_sequnlock(&lo->seqlock); dprintk("%s:Return\n", __func__); } @@ -307,40 +319,135 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) } } -static void pnfs_insert_layout(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg); +/* update lo->stateid with new if is more recent + * + * lo->stateid could be the open stateid, in which case we just use what given. + */ +static void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + nfs4_stateid *old = &lo->stateid; + bool overwrite = false; -/* Get layout from server. */ + write_seqlock(&lo->seqlock); + if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || + memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) + overwrite = true; + else { + u32 oldseq, newseq; + + oldseq = be32_to_cpu(old->stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) + overwrite = true; + } + if (overwrite) + memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); + write_sequnlock(&lo->seqlock); +} + +static void +pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, + struct nfs4_state *state) +{ + int seq; + + dprintk("--> %s\n", __func__); + write_seqlock(&lo->seqlock); + do { + seq = read_seqbegin(&state->seqlock); + memcpy(lo->stateid.data, state->stateid.data, + sizeof(state->stateid.data)); + } while (read_seqretry(&state->seqlock, seq)); + set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); + write_sequnlock(&lo->seqlock); + dprintk("<-- %s\n", __func__); +} + +void +pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +{ + int seq; + + dprintk("--> %s\n", __func__); + do { + seq = read_seqbegin(&lo->seqlock); + if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { + /* This will trigger retry of the read */ + pnfs_layout_from_open_stateid(lo, open_state); + } else + memcpy(dst->data, lo->stateid.data, + sizeof(lo->stateid.data)); + } while (read_seqretry(&lo->seqlock, seq)); + dprintk("<-- %s\n", __func__); +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. +* arg->offset: 0 +* arg->length: all ones +*/ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, u32 iomode) { struct inode *ino = lo->inode; - struct pnfs_layout_segment *lseg; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg = NULL; - /* Lets pretend we sent LAYOUTGET and got a response */ - lseg = kzalloc(sizeof(*lseg), GFP_KERNEL); - if (!lseg) { - set_bit(lo_fail_bit(iomode), &lo->state); - spin_lock(&ino->i_lock); - put_layout_hdr_locked(lo); - spin_unlock(&ino->i_lock); + dprintk("--> %s\n", __func__); + + BUG_ON(ctx == NULL); + lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); + if (lgp == NULL) { + put_layout_hdr(lo->inode); return NULL; } - init_lseg(lo, lseg); - lseg->iomode = IOMODE_RW; - spin_lock(&ino->i_lock); - pnfs_insert_layout(lo, lseg); - put_layout_hdr_locked(lo); - spin_unlock(&ino->i_lock); + lgp->args.minlength = NFS4_MAX_UINT64; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range.iomode = iomode; + lgp->args.range.offset = 0; + lgp->args.range.length = NFS4_MAX_UINT64; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->lsegpp = &lseg; + + /* Synchronously retrieve layout information from server and + * store in lseg. + */ + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(iomode), &lo->state); + } return lseg; } +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. + */ +static s64 +cmp_layout(u32 iomode1, u32 iomode2) +{ + /* read > read/write */ + return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); +} + static void pnfs_insert_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { + struct pnfs_layout_segment *lp; + int found = 0; + dprintk("%s:Begin\n", __func__); assert_spin_locked(&lo->inode->i_lock); @@ -352,19 +459,28 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, list_add_tail(&lo->layouts, &clp->cl_layouts); spin_unlock(&clp->cl_lock); } - get_layout_hdr_locked(lo); - /* STUB - add the constructed lseg if necessary */ - if (list_empty(&lo->segs)) { - list_add_tail(&lseg->fi_list, &lo->segs); - dprintk("%s: inserted lseg %p iomode %d at tail\n", - __func__, lseg, lseg->iomode); - } else { - /* There is no harm for the moment in calling this - * with the lock held, and the call will be removed - * with the STUB. - */ - put_lseg(lseg); + list_for_each_entry(lp, &lo->segs, fi_list) { + if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0) + continue; + list_add_tail(&lseg->fi_list, &lp->fi_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length, + lp, lp->range.iomode, lp->range.offset, + lp->range.length); + found = 1; + break; } + if (!found) { + list_add_tail(&lseg->fi_list, &lo->segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length); + } + get_layout_hdr_locked(lo); dprintk("%s:Return\n", __func__); } @@ -380,6 +496,7 @@ alloc_init_layout_hdr(struct inode *ino) lo->refcount = 1; INIT_LIST_HEAD(&lo->layouts); INIT_LIST_HEAD(&lo->segs); + seqlock_init(&lo->seqlock); lo->inode = ino; return lo; } @@ -407,11 +524,46 @@ pnfs_find_alloc_layout(struct inode *ino) return nfsi->layout; } -/* STUB - LAYOUTGET never succeeds, so cache is empty */ +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static int +is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) +{ + return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); +} + +/* + * lookup range in layout + */ static struct pnfs_layout_segment * pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) { - return NULL; + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->inode->i_lock); + list_for_each_entry(lseg, &lo->segs, fi_list) { + if (is_matching_lseg(lseg, iomode)) { + ret = lseg; + break; + } + if (cmp_layout(iomode, lseg->range.iomode) > 0) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); + return ret; } /* @@ -448,7 +600,7 @@ pnfs_update_layout(struct inode *ino, if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) goto out_unlock; - get_layout_hdr_locked(lo); + get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); @@ -460,3 +612,172 @@ out_unlock: spin_unlock(&ino->i_lock); goto out; } + +int +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->inode; + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + init_lseg(lo, lseg); + lseg->range = res->range; + *lgp->lsegpp = lseg; + pnfs_insert_layout(lo, lseg); + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid); + spin_unlock(&ino->i_lock); +out: + return status; +} + +/* + * Device ID cache. Currently supports one layout type per struct nfs_client. + * Add layout type to the lookup key to expand to support multiple types. + */ +int +pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, + void (*free_callback)(struct pnfs_deviceid_node *)) +{ + struct pnfs_deviceid_cache *c; + + c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); + if (!c) + return -ENOMEM; + spin_lock(&clp->cl_lock); + if (clp->cl_devid_cache != NULL) { + atomic_inc(&clp->cl_devid_cache->dc_ref); + dprintk("%s [kref [%d]]\n", __func__, + atomic_read(&clp->cl_devid_cache->dc_ref)); + kfree(c); + } else { + /* kzalloc initializes hlists */ + spin_lock_init(&c->dc_lock); + atomic_set(&c->dc_ref, 1); + c->dc_free_callback = free_callback; + clp->cl_devid_cache = c; + dprintk("%s [new]\n", __func__); + } + spin_unlock(&clp->cl_lock); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); + +/* + * Called from pnfs_layoutdriver_type->free_lseg + * last layout segment reference frees deviceid + */ +void +pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid) +{ + struct nfs4_deviceid *id = &devid->de_id; + struct pnfs_deviceid_node *d; + struct hlist_node *n; + long h = nfs4_deviceid_hash(id); + + dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); + if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) + return; + + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) + if (!memcmp(&d->de_id, id, sizeof(*id))) { + hlist_del_rcu(&d->de_node); + spin_unlock(&c->dc_lock); + synchronize_rcu(); + c->dc_free_callback(devid); + return; + } + spin_unlock(&c->dc_lock); + /* Why wasn't it found in the list? */ + BUG(); +} +EXPORT_SYMBOL_GPL(pnfs_put_deviceid); + +/* Find and reference a deviceid */ +struct pnfs_deviceid_node * +pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) +{ + struct pnfs_deviceid_node *d; + struct hlist_node *n; + long hash = nfs4_deviceid_hash(id); + + dprintk("--> %s hash %ld\n", __func__, hash); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { + if (!memcmp(&d->de_id, id, sizeof(*id))) { + if (!atomic_inc_not_zero(&d->de_ref)) { + goto fail; + } else { + rcu_read_unlock(); + return d; + } + } + } +fail: + rcu_read_unlock(); + return NULL; +} +EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); + +/* + * Add a deviceid to the cache. + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new + */ +struct pnfs_deviceid_node * +pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) +{ + struct pnfs_deviceid_node *d; + long hash = nfs4_deviceid_hash(&new->de_id); + + dprintk("--> %s hash %ld\n", __func__, hash); + spin_lock(&c->dc_lock); + d = pnfs_find_get_deviceid(c, &new->de_id); + if (d) { + spin_unlock(&c->dc_lock); + dprintk("%s [discard]\n", __func__); + c->dc_free_callback(new); + return d; + } + INIT_HLIST_NODE(&new->de_node); + atomic_set(&new->de_ref, 1); + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); + spin_unlock(&c->dc_lock); + dprintk("%s [new]\n", __func__); + return new; +} +EXPORT_SYMBOL_GPL(pnfs_add_deviceid); + +void +pnfs_put_deviceid_cache(struct nfs_client *clp) +{ + struct pnfs_deviceid_cache *local = clp->cl_devid_cache; + + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); + if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { + int i; + /* Verify cache is empty */ + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&local->dc_deviceids[i])); + clp->cl_devid_cache = NULL; + spin_unlock(&clp->cl_lock); + kfree(local); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1c3eb02f4944..cbba28cb02a7 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -32,7 +32,7 @@ struct pnfs_layout_segment { struct list_head fi_list; - u32 iomode; + struct pnfs_layout_range range; struct kref kref; struct pnfs_layout_hdr *layout; }; @@ -44,6 +44,7 @@ struct pnfs_layout_segment { enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ }; /* Per-layout driver specific registration structure */ @@ -54,26 +55,96 @@ struct pnfs_layoutdriver_type { struct module *owner; int (*initialize_mountpoint) (struct nfs_server *); int (*uninitialize_mountpoint) (struct nfs_server *); + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + void (*free_lseg) (struct pnfs_layout_segment *lseg); }; struct pnfs_layout_hdr { unsigned long refcount; struct list_head layouts; /* other client layouts */ struct list_head segs; /* layout segments list */ + seqlock_t seqlock; /* Protects the stateid */ + nfs4_stateid stateid; unsigned long state; struct inode *inode; }; +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + struct page **pages; + void *area; + unsigned int pgbase; + unsigned int pglen; +}; + +/* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_deviceid_hash(struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +struct pnfs_deviceid_node { + struct hlist_node de_node; + struct nfs4_deviceid de_id; + atomic_t de_ref; +}; + +struct pnfs_deviceid_cache { + spinlock_t dc_lock; + atomic_t dc_ref; + void (*dc_free_callback)(struct pnfs_deviceid_node *); + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; +}; + +extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, + void (*free_callback)(struct pnfs_deviceid_node *)); +extern void pnfs_put_deviceid_cache(struct nfs_client *); +extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( + struct pnfs_deviceid_cache *, + struct nfs4_deviceid *); +extern struct pnfs_deviceid_node *pnfs_add_deviceid( + struct pnfs_deviceid_cache *, + struct pnfs_deviceid_node *); +extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid); + extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +/* nfs4proc.c */ +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); + +/* pnfs.c */ struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); +void put_layout_hdr(struct inode *inode); +void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); static inline int lo_fail_bit(u32 iomode) diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 34da32436ac0..a9683d6acaa4 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -545,6 +545,8 @@ enum { NFSPROC4_CLNT_SEQUENCE, NFSPROC4_CLNT_GET_LEASE_TIME, NFSPROC4_CLNT_RECLAIM_COMPLETE, + NFSPROC4_CLNT_LAYOUTGET, + NFSPROC4_CLNT_GETDEVICEINFO, }; /* nfs41 types */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4d62f1581ed1..452d96436d26 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -83,6 +83,7 @@ struct nfs_client { u32 cl_exchange_flags; struct nfs4_session *cl_session; /* sharred session */ struct list_head cl_layouts; + struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_FSCACHE diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 065f9d105d05..ba6cc8f223c9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -187,6 +187,55 @@ struct nfs4_get_lease_time_res { struct nfs4_sequence_res lr_seq_res; }; +#define PNFS_LAYOUT_MAXSIZE 4096 + +struct nfs4_layoutdriver_data { + __u32 len; + void *buf; +}; + +struct pnfs_layout_range { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfs4_layoutget_args { + __u32 type; + struct pnfs_layout_range range; + __u64 minlength; + __u32 maxcount; + struct inode *inode; + struct nfs_open_context *ctx; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_layoutget_res { + __u32 return_on_close; + struct pnfs_layout_range range; + __u32 type; + nfs4_stateid stateid; + struct nfs4_layoutdriver_data layout; + struct nfs4_sequence_res seq_res; +}; + +struct nfs4_layoutget { + struct nfs4_layoutget_args args; + struct nfs4_layoutget_res res; + struct pnfs_layout_segment **lsegpp; + int status; +}; + +struct nfs4_getdeviceinfo_args { + struct pnfs_device *pdev; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_getdeviceinfo_res { + struct pnfs_device *pdev; + struct nfs4_sequence_res seq_res; +}; + /* * Arguments to the open call. */ From 16b374ca439fb406e46e071f75428f5b033056f8 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 20 Oct 2010 00:18:04 -0400 Subject: [PATCH 12/15] NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO infrastructure Implement the driver's io_ops->alloc_lseg and free_lseg functions, which integrate into the deviceid cache and calls out to nfs4_proc_getdeviceinfo when necessary. Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Marc Eshel Signed-off-by: Mike Sager Signed-off-by: Oleg Drokin Signed-off-by: Ricardo Labiaga Signed-off-by: Tao Guo Signed-off-by: Boaz Harrosh Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/client.c | 1 + fs/nfs/nfs4filelayout.c | 204 ++++++++++++++++- fs/nfs/nfs4filelayout.h | 94 ++++++++ fs/nfs/nfs4filelayoutdev.c | 448 +++++++++++++++++++++++++++++++++++++ 5 files changed, 747 insertions(+), 2 deletions(-) create mode 100644 fs/nfs/nfs4filelayout.h create mode 100644 fs/nfs/nfs4filelayoutdev.c diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 08a8889c5149..4776ff9e3814 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -20,4 +20,4 @@ nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -nfs_layout_nfsv41_files-y := nfs4filelayout.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e55ad211dbed..8934b2e339df 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -255,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp) nfs_free_client(clp); } } +EXPORT_SYMBOL_GPL(nfs_put_client); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 696e283c2632..9e82b51cb515 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -30,7 +30,9 @@ */ #include -#include "pnfs.h" + +#include "internal.h" +#include "nfs4filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -41,23 +43,223 @@ MODULE_DESCRIPTION("The NFSv4 file layout driver"); int filelayout_initialize_mountpoint(struct nfs_server *nfss) { + int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, + nfs4_fl_free_deviceid_callback); + if (status) { + printk(KERN_WARNING "%s: deviceid cache could not be " + "initialized\n", __func__); + return status; + } + dprintk("%s: deviceid cache has been initialized successfully\n", + __func__); return 0; } +/* Uninitialize a mountpoint by destroying its device list */ int filelayout_uninitialize_mountpoint(struct nfs_server *nfss) { dprintk("--> %s\n", __func__); + if (nfss->nfs_client->cl_devid_cache) + pnfs_put_deviceid_cache(nfss->nfs_client); return 0; } +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. + * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + struct nfs_server *nfss = NFS_SERVER(lo->inode); + + dprintk("--> %s\n", __func__); + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld to large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Stripe unit (%u) not page aligned\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + if (dsaddr == NULL) { + dsaddr = get_device_info(lo->inode, id); + if (dsaddr == NULL) + goto out; + } + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index < 0 || + fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %d\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { + dprintk("%s Stripe unit (%u) not aligned with rsize %u " + "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, + nfss->wsize); + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); + goto out; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + int i; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id) +{ + uint32_t *p = (uint32_t *)lgr->layout.buf; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + GFP_KERNEL); + if (!fl->fh_array) + return -ENOMEM; + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); + if (!fl->fh_array[i]) { + filelayout_free_fh_array(fl); + return -ENOMEM; + } + fl->fh_array[i]->size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + printk(KERN_ERR "Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + filelayout_free_fh_array(fl); + return -EIO; + } + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + p += XDR_QUADLEN(fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) { + _filelayout_free_lseg(fl); + return NULL; + } + return &fl->generic_hdr; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, + &fl->dsaddr->deviceid); + _filelayout_free_lseg(fl); +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, .initialize_mountpoint = filelayout_initialize_mountpoint, .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h new file mode 100644 index 000000000000..bbf60dd2ab9d --- /dev/null +++ b/fs/nfs/nfs4filelayout.h @@ -0,0 +1,94 @@ +/* + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "pnfs.h" + +/* + * Field testing shows we need to support upto 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. + */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + u32 ds_ip_addr; + u32 ds_port; + struct nfs_client *ds_clp; + atomic_t ds_count; +}; + +struct nfs4_file_layout_dsaddr { + struct pnfs_deviceid_node deviceid; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; +}; + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); +extern void print_ds(struct nfs4_pnfs_ds *ds); +extern void print_deviceid(struct nfs4_deviceid *dev_id); +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c new file mode 100644 index 000000000000..51fe64ace55a --- /dev/null +++ b/fs/nfs/nfs4filelayoutdev.c @@ -0,0 +1,448 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk("%s NULL device\n", __func__); + return; + } + printk(" ip_addr %x port %hu\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +void +print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) +{ + int i; + + ifdebug(FACILITY) { + printk("%s dsaddr->ds_num %d\n", __func__, + dsaddr->ds_num); + for (i = 0; i < dsaddr->ds_num; i++) + print_ds(dsaddr->ds_list[i]); + } +} + +void print_deviceid(struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} + +/* nfs4_ds_cache_lock is held */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *ds; + + dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", + ntohl(ip_addr), ntohs(port)); + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + if (ds->ds_ip_addr == ip_addr && + ds->ds_port == port) { + return ds; + } + } + return NULL; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + kfree(ds); +} + +static void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + print_deviceid(&dsaddr->deviceid.de_id); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +void +nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) +{ + struct nfs4_file_layout_dsaddr *dsaddr = + container_of(device, struct nfs4_file_layout_dsaddr, deviceid); + + nfs4_fl_free_deviceid(dsaddr); +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds; + + ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); + if (!ds) + goto out; + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(ip_addr, port); + if (tmp_ds == NULL) { + ds->ds_ip_addr = ip_addr; + ds->ds_port = port; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server ip 0x%x\n", __func__, + ds->ds_ip_addr); + } else { + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_ip_addr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} + +/* + * Currently only support ipv4, and one multi-path address. + */ +static struct nfs4_pnfs_ds * +decode_and_add_ds(__be32 **pp, struct inode *inode) +{ + struct nfs4_pnfs_ds *ds = NULL; + char *buf; + const char *ipend, *pstr; + u32 ip_addr, port; + int nlen, rlen, i; + int tmp[2]; + __be32 *r_netid, *r_addr, *p = *pp; + + /* r_netid */ + nlen = be32_to_cpup(p++); + r_netid = p; + p += XDR_QUADLEN(nlen); + + /* r_addr */ + rlen = be32_to_cpup(p++); + r_addr = p; + p += XDR_QUADLEN(rlen); + *pp = p; + + /* Check that netid is "tcp" */ + if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { + dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + goto out_err; + } + + /* ipv6 length plus port is legal */ + if (rlen > INET6_ADDRSTRLEN + 8) { + dprintk("%s Invalid address, length %d\n", __func__, + rlen); + goto out_err; + } + buf = kmalloc(rlen + 1, GFP_KERNEL); + buf[rlen] = '\0'; + memcpy(buf, r_addr, rlen); + + /* replace the port dots with dashes for the in4_pton() delimiter*/ + for (i = 0; i < 2; i++) { + char *res = strrchr(buf, '.'); + *res = '-'; + } + + /* Currently only support ipv4 address */ + if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { + dprintk("%s: Only ipv4 addresses supported\n", __func__); + goto out_free; + } + + /* port */ + pstr = ipend; + sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + ds = nfs4_pnfs_ds_add(inode, ip_addr, port); + dprintk("%s Decoded address and port %s\n", __func__, buf); +out_free: + kfree(buf); +out_err: + return ds; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev) +{ + int i, dummy; + u32 cnt, num; + u8 *indexp; + __be32 *p = (__be32 *)pdev->area, *indicesp; + struct nfs4_file_layout_dsaddr *dsaddr; + + /* Get the stripe count (number of stripe index) */ + cnt = be32_to_cpup(p++); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "%s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err; + } + + /* Check the multipath list count */ + indicesp = p; + p += XDR_QUADLEN(cnt << 2); + num = be32_to_cpup(p++); + dprintk("%s ds_num %u\n", __func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "%s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err; + } + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + GFP_KERNEL); + if (!dsaddr) + goto out_err; + + dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); + if (!dsaddr->stripe_indices) + goto out_err_free; + + dsaddr->stripe_count = cnt; + dsaddr->ds_num = num; + + memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); + + /* Go back an read stripe indices */ + p = indicesp; + indexp = &dsaddr->stripe_indices[0]; + for (i = 0; i < dsaddr->stripe_count; i++) { + *indexp = be32_to_cpup(p++); + if (*indexp >= num) + goto out_err_free; + indexp++; + } + /* Skip already read multipath list count */ + p++; + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + + dummy = be32_to_cpup(p++); /* multipath count */ + if (dummy > 1) { + printk(KERN_WARNING + "%s: Multipath count %d not supported, " + "skipping all greater than 1\n", __func__, + dummy); + } + for (j = 0; j < dummy; j++) { + if (j == 0) { + dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + if (dsaddr->ds_list[i] == NULL) + goto out_err_free; + } else { + u32 len; + /* skip extra multipath */ + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + continue; + } + } + } + return dsaddr; + +out_err_free: + nfs4_fl_free_deviceid(dsaddr); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' + * and add it to the list of available devices. + * If the deviceid is already cached, nfs4_add_deviceid will return + * a pointer to the cached struct and throw away the new. + */ +static struct nfs4_file_layout_dsaddr* +decode_and_add_device(struct inode *inode, struct pnfs_device *dev) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + struct pnfs_deviceid_node *d; + + dsaddr = decode_device(inode, dev); + if (!dsaddr) { + printk(KERN_WARNING "%s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, + &dsaddr->deviceid); + + return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. + */ +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + + /* set pdev->area */ + pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); + if (!pdev->area) + goto out_free; + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = PAGE_SIZE * max_pages; + pdev->mincount = 0; + + rc = nfs4_proc_getdeviceinfo(server, pdev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + dsaddr = decode_and_add_device(inode, pdev); +out_free: + if (pdev->area != NULL) + vunmap(pdev->area); + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) +{ + struct pnfs_deviceid_node *d; + + d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); + return (d == NULL) ? NULL : + container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} From 1c787096fce217b5fdd9806dbce96e738c9345c0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Oct 2010 16:56:48 -0400 Subject: [PATCH 13/15] NFSv4.1: Use more sensible names for 'initialize_mountpoint' The initialize_mountpoint/uninitialise_mountpoint functions are really about setting or clearing the layout driver to be used on this filesystem. Change the names to the more descriptive 'set_layoutdriver/clear_layoutdriver'. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 14 +++++++------- fs/nfs/pnfs.c | 4 ++-- fs/nfs/pnfs.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 9e82b51cb515..2e92f0d8d654 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -40,8 +40,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dean Hildebrand "); MODULE_DESCRIPTION("The NFSv4 file layout driver"); -int -filelayout_initialize_mountpoint(struct nfs_server *nfss) +static int +filelayout_set_layoutdriver(struct nfs_server *nfss) { int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, nfs4_fl_free_deviceid_callback); @@ -55,9 +55,9 @@ filelayout_initialize_mountpoint(struct nfs_server *nfss) return 0; } -/* Uninitialize a mountpoint by destroying its device list */ -int -filelayout_uninitialize_mountpoint(struct nfs_server *nfss) +/* Clear out the layout by destroying its device list */ +static int +filelayout_clear_layoutdriver(struct nfs_server *nfss) { dprintk("--> %s\n", __func__); @@ -256,8 +256,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, - .initialize_mountpoint = filelayout_initialize_mountpoint, - .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, + .set_layoutdriver = filelayout_set_layoutdriver, + .clear_layoutdriver = filelayout_clear_layoutdriver, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, }; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d1ad7df3479e..db773428f95f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -75,7 +75,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *nfss) { if (nfss->pnfs_curr_ld) { - nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss); + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); module_put(nfss->pnfs_curr_ld->owner); } nfss->pnfs_curr_ld = NULL; @@ -115,7 +115,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } server->pnfs_curr_ld = ld_type; - if (ld_type->initialize_mountpoint(server)) { + if (ld_type->set_layoutdriver(server)) { printk(KERN_ERR "%s: Error initializing mount point for layout driver %u.\n", __func__, id); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index cbba28cb02a7..e12367d50489 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -53,8 +53,8 @@ struct pnfs_layoutdriver_type { const u32 id; const char *name; struct module *owner; - int (*initialize_mountpoint) (struct nfs_server *); - int (*uninitialize_mountpoint) (struct nfs_server *); + int (*set_layoutdriver) (struct nfs_server *); + int (*clear_layoutdriver) (struct nfs_server *); struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); void (*free_lseg) (struct pnfs_layout_segment *lseg); }; From 43c2e885be25311e6289c7da52e8a03c4453ee03 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 2 Oct 2010 15:19:01 -0400 Subject: [PATCH 14/15] nfs4: fix channel attribute sanity-checks The sanity checks here are incorrect; in the worst case they allow values that crash the client. They're also over-reliant on the preprocessor. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 75 ++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7e14e991ddfa..32c8758c99fd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4842,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->bc_attrs.max_reqs); } -static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) { - if (rcvd <= sent) - return 0; - printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " - "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); - return -EINVAL; + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + + if (rcvd->headerpadsz > sent->headerpadsz) + return -EINVAL; + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. + * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; } -#define _verify_fore_channel_attr(_name_) \ - _verify_channel_attr("fore", #_name_, \ - args->fc_attrs._name_, \ - session->fc_attrs._name_) +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &session->bc_attrs; -#define _verify_back_channel_attr(_name_) \ - _verify_channel_attr("back", #_name_, \ - args->bc_attrs._name_, \ - session->bc_attrs._name_) + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops == 0) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} -/* - * The server is not allowed to increase the fore channel header pad size, - * maximum response size, or maximum number of operations. - * - * The back channel attributes are only negotiatied down: We send what the - * (back channel) server insists upon. - */ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) { - int ret = 0; + int ret; - ret |= _verify_fore_channel_attr(headerpadsz); - ret |= _verify_fore_channel_attr(max_resp_sz); - ret |= _verify_fore_channel_attr(max_ops); - - ret |= _verify_back_channel_attr(headerpadsz); - ret |= _verify_back_channel_attr(max_rqst_sz); - ret |= _verify_back_channel_attr(max_resp_sz); - ret |= _verify_back_channel_attr(max_resp_sz_cached); - ret |= _verify_back_channel_attr(max_ops); - ret |= _verify_back_channel_attr(max_reqs); - - return ret; + ret = nfs4_verify_fore_channel_attrs(args, session); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, session); } static int _nfs4_proc_create_session(struct nfs_client *clp) From 411b5e05617593efebc06241dbc56f42150f2abe Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Sep 2010 12:48:01 -0700 Subject: [PATCH 15/15] net/sunrpc: Use static const char arrays Signed-off-by: Joe Perches Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 032644610524..8a4d083c9b30 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -422,7 +422,7 @@ static int context_derive_keys_rc4(struct krb5_ctx *ctx) { struct crypto_hash *hmac; - char sigkeyconstant[] = "signaturekey"; + static const char sigkeyconstant[] = "signaturekey"; int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ struct hash_desc desc; struct scatterlist sg[1];