The big ticket item here is support for rbd exclusive-lock feature,

with maintenance operations offloaded to userspace (Douglas Fuller, Mike Christie and myself). Another block device bullet is a series fixing up layering error paths (myself). On the filesystem side, we've got patches that improve our handling of buffered vs dio write races (Neil Brown) and a few assorted fixes from Zheng. Also included a couple of random cleanups and a minor CRUSH update. -----BEGIN PGP SIGNATURE----- Version: GnuPG v2 iQEcBAABCAAGBQJX+PjZAAoJEEp/3jgCEfOLVuoH/RwtFLIb6/KZUYtBOrVVrTun kReRlfq2xKYrGGtyQEqSuz7fBdwT1LVCVcL8kC4GFD4R67o+tNMAr6PfM/7pZABj HRoRLgSZ9FLw4W5n0VpBIznih75QUbCdXiTCtH9eorMHU5q1YpTvVHHlF9W9Pm2I eNGnBWpGyHVeiK66mpUCH+EQKQ4GkAVD9rneTNqLHgq2yotHkVl1j258+DL6JRGs OBoh3RmNQaGOAS37Lss8erCSusAGEcAeGV6ubuK2lFUKyR41EkD3I0xkhNSPe+CD RifFcpVziIeTu//cLgl0nnHGtmUytD7HgJubaPthArKIOen9ZDAfEkgI0o+JI2A= =45O7 -----END PGP SIGNATURE----- Merge tag 'ceph-for-4.9-rc1' of git://github.com/ceph/ceph-client Pull Ceph updates from Ilya Dryomov: "The big ticket item here is support for rbd exclusive-lock feature, with maintenance operations offloaded to userspace (Douglas Fuller, Mike Christie and myself). Another block device bullet is a series fixing up layering error paths (myself). On the filesystem side, we've got patches that improve our handling of buffered vs dio write races (Neil Brown) and a few assorted fixes from Zheng. Also included a couple of random cleanups and a minor CRUSH update" * tag 'ceph-for-4.9-rc1' of git://github.com/ceph/ceph-client: (39 commits) crush: remove redundant local variable crush: don't normalize input of crush_ln iteratively libceph: ceph_build_auth() doesn't need ceph_auth_build_hello() libceph: use CEPH_AUTH_UNKNOWN in ceph_auth_build_hello() ceph: fix description for rsize and rasize mount options rbd: use kmalloc_array() in rbd_header_from_disk() ceph: use list_move instead of list_del/list_add ceph: handle CEPH_SESSION_REJECT message ceph: avoid accessing / when mounting a subpath ceph: fix mandatory flock check ceph: remove warning when ceph_releasepage() is called on dirty page ceph: ignore error from invalidate_inode_pages2_range() in direct write ceph: fix error handling of start_read() rbd: add rbd_obj_request_error() helper rbd: img_data requests don't own their page array rbd: don't call rbd_osd_req_format_read() for !img_data requests rbd: rework rbd_img_obj_exists_submit() error paths rbd: don't crash or leak on errors in rbd_img_obj_parent_read_full_callback() rbd: move bumping img_request refcount into rbd_obj_request_submit() rbd: mark the original request as done if stat request fails ...
2016-10-10 13:52:05 -07:00 · 2016-10-10 13:52:05 -07:00 · 8dfb790b15
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@ -6,7 +6,7 @@ Description:

 Being used for adding and removing rbd block devices.

-Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+Usage: <mon ip addr> <options> <pool name> <rbd image name> [<snap name>]

 $ echo "192.168.0.1 name=admin rbd foo" > /sys/bus/rbd/add

@ -14,9 +14,13 @@ The snapshot name can be "-" or omitted to map the image read/write. A <dev-id>
 will be assigned for any registered block device. If snapshot is used, it will
 be mapped read-only.

-Removal of a device:
+Usage: <dev-id> [force]

-  $ echo <dev-id> > /sys/bus/rbd/remove
+ $ echo 2 > /sys/bus/rbd/remove
+
+Optional "force" argument which when passed will wait for running requests and
+then unmap the image. Requests sent to the driver after initiating the removal
+will be failed.  (August 2016, since 4.9.)

 What:		/sys/bus/rbd/add_single_major
 Date:		December 2013
@ -43,10 +47,25 @@ Description:	Available only if rbd module is inserted with single_major
 Entries under /sys/bus/rbd/devices/<dev-id>/
 --------------------------------------------

+client_addr
+
+	The ceph unique client entity_addr_t (address + nonce).
+	The format is <address>:<port>/<nonce>: '1.2.3.4:1234/5678' or
+	'[1:2:3:4:5:6:7:8]:1234/5678'.  (August 2016, since 4.9.)
+
 client_id

 	The ceph unique client id that was assigned for this specific session.

+cluster_fsid
+
+	The ceph cluster UUID.  (August 2016, since 4.9.)
+
+config_info
+
+	The string written into /sys/bus/rbd/add{,_single_major}.  (August
+	2016, since 4.9.)
+
 features

 	A hexadecimal encoding of the feature bits for this image.
@ -92,6 +111,10 @@ current_snap

 	The current snapshot for which the device is mapped.

+snap_id
+
+	The current snapshot's id.  (August 2016, since 4.9.)
+
 parent

 	Information identifying the chain of parent images in a layered rbd
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@ -98,6 +98,10 @@ Mount Options
 	size.

  rsize=X
+	Specify the maximum read size in bytes.  By default there is no
+	maximum.
+
+  rasize=X
 	Specify the maximum readahead.

  mount_timeout=X
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@ -28,6 +28,17 @@
 #define RBD_DATA_PREFIX        "rbd_data."
 #define RBD_ID_PREFIX          "rbd_id."

+#define RBD_LOCK_NAME          "rbd_lock"
+#define RBD_LOCK_TAG           "internal"
+#define RBD_LOCK_COOKIE_PREFIX "auto"
+
+enum rbd_notify_op {
+	RBD_NOTIFY_OP_ACQUIRED_LOCK      = 0,
+	RBD_NOTIFY_OP_RELEASED_LOCK      = 1,
+	RBD_NOTIFY_OP_REQUEST_LOCK       = 2,
+	RBD_NOTIFY_OP_HEADER_UPDATE      = 3,
+};
+
 /*
 * For format version 1, rbd image 'foo' consists of objects
 *   foo.rbd		- image metadata
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@ -175,9 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,

 static int ceph_releasepage(struct page *page, gfp_t g)
 {
-	dout("%p releasepage %p idx %lu\n", page->mapping->host,
-	     page, page->index);
-	WARN_ON(PageDirty(page));
+	dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
+	     page, page->index, PageDirty(page) ? "" : "not ");

 	/* Can we release the page from the cache? */
 	if (!ceph_release_fscache_page(page, g))
@ -298,14 +297,6 @@ unlock:
 	kfree(osd_data->pages);
 }

-static void ceph_unlock_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		unlock_page(pages[i]);
-}
-
 /*
 * start an async read(ahead) operation.  return nr_pages we submitted
 * a read for on success, or negative error code.
@ -370,6 +361,10 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 			dout("start_read %p add_to_page_cache failed %p\n",
 			     inode, page);
 			nr_pages = i;
+			if (nr_pages > 0) {
+				len = nr_pages << PAGE_SHIFT;
+				break;
+			}
 			goto out_pages;
 		}
 		pages[i] = page;
@ -386,8 +381,11 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	return nr_pages;

 out_pages:
-	ceph_unlock_page_vector(pages, nr_pages);
-	ceph_release_page_vector(pages, nr_pages);
+	for (i = 0; i < nr_pages; ++i) {
+		ceph_fscache_readpage_cancel(inode, pages[i]);
+		unlock_page(pages[i]);
+	}
+	ceph_put_page_vector(pages, nr_pages, false);
 out:
 	ceph_osdc_put_request(req);
 	return ret;
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@ -902,10 +902,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 		return ret;

 	if (write) {
-		ret = invalidate_inode_pages2_range(inode->i_mapping,
+		int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
 					pos >> PAGE_SHIFT,
 					(pos + count) >> PAGE_SHIFT);
-		if (ret < 0)
+		if (ret2 < 0)
 			dout("invalidate_inode_pages2_range returned %d\n", ret);

 		flags = CEPH_OSD_FLAG_ORDERSNAP |
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@ -210,8 +210,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 	/* No mandatory locks */
-	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
-		return -ENOLCK;
+	if (fl->fl_type & LOCK_MAND)
+		return -EOPNOTSUPP;

 	dout("ceph_flock, fl_file: %p", fl->fl_file);

--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@ -370,6 +370,7 @@ const char *ceph_session_state_name(int s)
 	case CEPH_MDS_SESSION_CLOSING: return "closing";
 	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
 	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+	case CEPH_MDS_SESSION_REJECTED: return "rejected";
 	default: return "???";
 	}
 }
@ -1150,8 +1151,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		while (!list_empty(&ci->i_cap_flush_list)) {
 			cf = list_first_entry(&ci->i_cap_flush_list,
 					      struct ceph_cap_flush, i_list);
-			list_del(&cf->i_list);
-			list_add(&cf->i_list, &to_remove);
+			list_move(&cf->i_list, &to_remove);
 		}

 		spin_lock(&mdsc->cap_dirty_lock);
@ -1378,7 +1378,7 @@ static int request_close_session(struct ceph_mds_client *mdsc,
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
-	return 0;
+	return 1;
 }

 /*
@ -2131,6 +2131,10 @@ static int __do_request(struct ceph_mds_client *mdsc,
 	     ceph_session_state_name(session->s_state));
 	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
 	    session->s_state != CEPH_MDS_SESSION_HUNG) {
+		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
+			err = -EACCES;
+			goto out_session;
+		}
 		if (session->s_state == CEPH_MDS_SESSION_NEW ||
 		    session->s_state == CEPH_MDS_SESSION_CLOSING)
 			__open_session(mdsc, session);
@ -2652,6 +2656,15 @@ static void handle_session(struct ceph_mds_session *session,
 		wake_up_session_caps(session, 0);
 		break;

+	case CEPH_SESSION_REJECT:
+		WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
+		pr_info("mds%d rejected session\n", session->s_mds);
+		session->s_state = CEPH_MDS_SESSION_REJECTED;
+		cleanup_session_requests(mdsc, session);
+		remove_session_caps(session);
+		wake = 2; /* for good measure */
+		break;
+
 	default:
 		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
 		WARN_ON(1);
@ -3557,11 +3570,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 /*
 * true if all sessions are closed, or we force unmount
 */
-static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 {
 	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
 		return true;
-	return atomic_read(&mdsc->num_sessions) == 0;
+	return atomic_read(&mdsc->num_sessions) <= skipped;
 }

 /*
@ -3572,6 +3585,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_session *session;
 	int i;
+	int skipped = 0;

 	dout("close_sessions\n");

@ -3583,7 +3597,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 			continue;
 		mutex_unlock(&mdsc->mutex);
 		mutex_lock(&session->s_mutex);
-		__close_session(mdsc, session);
+		if (__close_session(mdsc, session) <= 0)
+			skipped++;
 		mutex_unlock(&session->s_mutex);
 		ceph_put_mds_session(session);
 		mutex_lock(&mdsc->mutex);
@ -3591,7 +3606,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	mutex_unlock(&mdsc->mutex);

 	dout("waiting for sessions to close\n");
-	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+	wait_event_timeout(mdsc->session_close_wq,
+			   done_closing_sessions(mdsc, skipped),
 			   ceph_timeout_jiffies(opts->mount_timeout));

 	/* tear down remaining sessions */
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@ -121,6 +121,7 @@ enum {
 	CEPH_MDS_SESSION_CLOSING = 5,
 	CEPH_MDS_SESSION_RESTARTING = 6,
 	CEPH_MDS_SESSION_RECONNECTING = 7,
+	CEPH_MDS_SESSION_REJECTED = 8,
 };

 struct ceph_mds_session {
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@ -43,6 +43,8 @@ const char *ceph_session_op_name(int op)
 	case CEPH_SESSION_RECALL_STATE: return "recall_state";
 	case CEPH_SESSION_FLUSHMSG: return "flushmsg";
 	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
+	case CEPH_SESSION_FORCE_RO: return "force_ro";
+	case CEPH_SESSION_REJECT: return "reject";
 	}
 	return "???";
 }
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@ -396,10 +396,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	 */
 	dev_name_end = strchr(dev_name, '/');
 	if (dev_name_end) {
-		fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
-		if (!fsopt->server_path) {
-			err = -ENOMEM;
-			goto out;
+		if (strlen(dev_name_end) > 1) {
+			fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+			if (!fsopt->server_path) {
+				err = -ENOMEM;
+				goto out;
+			}
 		}
 	} else {
 		dev_name_end = dev_name + strlen(dev_name);
@ -788,15 +790,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 		struct inode *inode = req->r_target_inode;
 		req->r_target_inode = NULL;
 		dout("open_root_inode success\n");
-		if (ceph_ino(inode) == CEPH_INO_ROOT &&
-		    fsc->sb->s_root == NULL) {
-			root = d_make_root(inode);
-			if (!root) {
-				root = ERR_PTR(-ENOMEM);
-				goto out;
-			}
-		} else {
-			root = d_obtain_root(inode);
+		root = d_make_root(inode);
+		if (!root) {
+			root = ERR_PTR(-ENOMEM);
+			goto out;
 		}
 		ceph_init_dentry(root);
 		dout("open_root_inode success, root dentry is %p\n", root);
@ -825,17 +822,24 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 	mutex_lock(&fsc->client->mount_mutex);

 	if (!fsc->sb->s_root) {
+		const char *path;
 		err = __ceph_open_session(fsc->client, started);
 		if (err < 0)
 			goto out;

-		dout("mount opening root\n");
-		root = open_root_dentry(fsc, "", started);
+		if (!fsc->mount_options->server_path) {
+			path = "";
+			dout("mount opening path \\t\n");
+		} else {
+			path = fsc->mount_options->server_path + 1;
+			dout("mount opening path %s\n", path);
+		}
+		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
 			goto out;
 		}
-		fsc->sb->s_root = root;
+		fsc->sb->s_root = dget(root);
 		first = 1;

 		err = ceph_fs_debugfs_init(fsc);
@ -843,19 +847,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 			goto fail;
 	}

-	if (!fsc->mount_options->server_path) {
-		root = fsc->sb->s_root;
-		dget(root);
-	} else {
-		const char *path = fsc->mount_options->server_path + 1;
-		dout("mount opening path %s\n", path);
-		root = open_root_dentry(fsc, path, started);
-		if (IS_ERR(root)) {
-			err = PTR_ERR(root);
-			goto fail;
-		}
-	}
-
 	fsc->mount_state = CEPH_MOUNT_MOUNTED;
 	dout("mount success\n");
 	mutex_unlock(&fsc->client->mount_mutex);
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@ -104,7 +104,7 @@ extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
 extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 				  void *buf, size_t len,
 				  void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+int ceph_auth_entity_name_encode(const char *name, void **p, void *end);

 extern int ceph_build_auth(struct ceph_auth_client *ac,
 		    void *msg_buf, size_t msg_len);
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@ -138,6 +138,9 @@ struct ceph_dir_layout {
 #define CEPH_MSG_POOLOP_REPLY           48
 #define CEPH_MSG_POOLOP                 49

+/* mon commands */
+#define CEPH_MSG_MON_COMMAND            50
+#define CEPH_MSG_MON_COMMAND_ACK        51

 /* osd */
 #define CEPH_MSG_OSD_MAP                41
@ -176,6 +179,14 @@ struct ceph_mon_statfs_reply {
 	struct ceph_statfs st;
 } __attribute__ ((packed));

+struct ceph_mon_command {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 num_strs;         /* always 1 */
+	__le32 str_len;
+	char str[];
+} __attribute__ ((packed));
+
 struct ceph_osd_getmap {
 	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
@ -270,6 +281,7 @@ enum {
 	CEPH_SESSION_FLUSHMSG,
 	CEPH_SESSION_FLUSHMSG_ACK,
 	CEPH_SESSION_FORCE_RO,
+	CEPH_SESSION_REJECT,
 };

 extern const char *ceph_session_op_name(int op);
--- a/include/linux/ceph/cls_lock_client.h
+++ b/include/linux/ceph/cls_lock_client.h
@ -0,0 +1,49 @@
+#ifndef _LINUX_CEPH_CLS_LOCK_CLIENT_H
+#define _LINUX_CEPH_CLS_LOCK_CLIENT_H
+
+#include <linux/ceph/osd_client.h>
+
+enum ceph_cls_lock_type {
+	CEPH_CLS_LOCK_NONE = 0,
+	CEPH_CLS_LOCK_EXCLUSIVE = 1,
+	CEPH_CLS_LOCK_SHARED = 2,
+};
+
+struct ceph_locker_id {
+	struct ceph_entity_name name;	/* locker's client name */
+	char *cookie;			/* locker's cookie */
+};
+
+struct ceph_locker_info {
+	struct ceph_entity_addr addr;	/* locker's address */
+};
+
+struct ceph_locker {
+	struct ceph_locker_id id;
+	struct ceph_locker_info info;
+};
+
+int ceph_cls_lock(struct ceph_osd_client *osdc,
+		  struct ceph_object_id *oid,
+		  struct ceph_object_locator *oloc,
+		  char *lock_name, u8 type, char *cookie,
+		  char *tag, char *desc, u8 flags);
+int ceph_cls_unlock(struct ceph_osd_client *osdc,
+		    struct ceph_object_id *oid,
+		    struct ceph_object_locator *oloc,
+		    char *lock_name, char *cookie);
+int ceph_cls_break_lock(struct ceph_osd_client *osdc,
+			struct ceph_object_id *oid,
+			struct ceph_object_locator *oloc,
+			char *lock_name, char *cookie,
+			struct ceph_entity_name *locker);
+
+void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers);
+
+int ceph_cls_lock_info(struct ceph_osd_client *osdc,
+		       struct ceph_object_id *oid,
+		       struct ceph_object_locator *oloc,
+		       char *lock_name, u8 *type, char **tag,
+		       struct ceph_locker **lockers, u32 *num_lockers);
+
+#endif
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@ -264,7 +264,8 @@ extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
 					      void *private,
 					      u64 supported_features,
 					      u64 required_features);
-extern u64 ceph_client_id(struct ceph_client *client);
+struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
+u64 ceph_client_gid(struct ceph_client *client);
 extern void ceph_destroy_client(struct ceph_client *client);
 extern int __ceph_open_session(struct ceph_client *client,
 			       unsigned long started);
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@ -141,6 +141,9 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
 int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
 				ceph_monc_callback_t cb, u64 private_data);

+int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
+			    struct ceph_entity_addr *client_addr);
+
 extern int ceph_monc_open_session(struct ceph_mon_client *monc);

 extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@ -120,6 +120,9 @@ struct ceph_osd_req_op {
 			struct ceph_osd_data request_data;
 			struct ceph_osd_data response_data;
 		} notify;
+		struct {
+			struct ceph_osd_data response_data;
+		} list_watchers;
 		struct {
 			u64 expected_object_size;
 			u64 expected_write_size;
@ -249,6 +252,12 @@ struct ceph_osd_linger_request {
 	size_t *preply_len;
 };

+struct ceph_watch_item {
+	struct ceph_entity_name name;
+	u64 cookie;
+	struct ceph_entity_addr addr;
+};
+
 struct ceph_osd_client {
 	struct ceph_client     *client;

@ -346,7 +355,6 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
 					struct page **pages, u64 length,
 					u32 alignment, bool pages_from_pool,
 					bool own_pages);
-
 extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
 					unsigned int which, u16 opcode,
 					const char *class, const char *method);
@ -389,6 +397,14 @@ extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
 void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);

+int ceph_osdc_call(struct ceph_osd_client *osdc,
+		   struct ceph_object_id *oid,
+		   struct ceph_object_locator *oloc,
+		   const char *class, const char *method,
+		   unsigned int flags,
+		   struct page *req_page, size_t req_len,
+		   struct page *resp_page, size_t *resp_len);
+
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
 			       struct ceph_file_layout *layout,
@ -434,5 +450,10 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
 		     size_t *preply_len);
 int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
 			  struct ceph_osd_linger_request *lreq);
+int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
+			    struct ceph_object_id *oid,
+			    struct ceph_object_locator *oloc,
+			    struct ceph_watch_item **watchers,
+			    u32 *num_watchers);
 #endif

--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@ -5,6 +5,7 @@ obj-$(CONFIG_CEPH_LIB) += libceph.o

 libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	mon_client.o \
+	cls_lock_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	debugfs.o \
 	auth.o auth_none.o \
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@ -82,7 +82,10 @@ void ceph_auth_reset(struct ceph_auth_client *ac)
 	mutex_unlock(&ac->mutex);
 }

-int ceph_entity_name_encode(const char *name, void **p, void *end)
+/*
+ * EntityName, not to be confused with entity_name_t
+ */
+int ceph_auth_entity_name_encode(const char *name, void **p, void *end)
 {
 	int len = strlen(name);

@ -111,7 +114,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
 	monhdr->session_mon = cpu_to_le16(-1);
 	monhdr->session_mon_tid = 0;

-	ceph_encode_32(&p, 0);  /* no protocol, yet */
+	ceph_encode_32(&p, CEPH_AUTH_UNKNOWN);  /* no protocol, yet */

 	lenp = p;
 	p += sizeof(u32);
@ -124,7 +127,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
 	for (i = 0; i < num; i++)
 		ceph_encode_32(&p, supported_protocols[i]);

-	ret = ceph_entity_name_encode(ac->name, &p, end);
+	ret = ceph_auth_entity_name_encode(ac->name, &p, end);
 	if (ret < 0)
 		goto out;
 	ceph_decode_need(&p, end, sizeof(u64), bad);
@ -259,9 +262,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
 	int ret = 0;

 	mutex_lock(&ac->mutex);
-	if (!ac->protocol)
-		ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
-	else if (ac->ops->should_authenticate(ac))
+	if (ac->ops->should_authenticate(ac))
 		ret = ceph_build_auth_request(ac, msg_buf, msg_len);
 	mutex_unlock(&ac->mutex);
 	return ret;
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@ -46,7 +46,7 @@ static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac,
 	int ret;

 	ceph_encode_8_safe(&p, end, 1, e_range);
-	ret = ceph_entity_name_encode(ac->name, &p, end);
+	ret = ceph_auth_entity_name_encode(ac->name, &p, end);
 	if (ret < 0)
 		return ret;

--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@ -566,11 +566,17 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 }
 EXPORT_SYMBOL(ceph_print_client_options);

-u64 ceph_client_id(struct ceph_client *client)
+struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client)
+{
+	return &client->msgr.inst.addr;
+}
+EXPORT_SYMBOL(ceph_client_addr);
+
+u64 ceph_client_gid(struct ceph_client *client)
 {
 	return client->monc.auth->global_id;
 }
-EXPORT_SYMBOL(ceph_client_id);
+EXPORT_SYMBOL(ceph_client_gid);

 /*
 * create a fresh client instance
@ -685,7 +691,8 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
 			return client->auth_err;
 	}

-	pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid);
+	pr_info("client%llu fsid %pU\n", ceph_client_gid(client),
+		&client->fsid);
 	ceph_debugfs_client_init(client);

 	return 0;
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@ -15,6 +15,7 @@ const char *ceph_entity_type_name(int type)
 	default: return "unknown";
 	}
 }
+EXPORT_SYMBOL(ceph_entity_type_name);

 const char *ceph_osd_op_name(int op)
 {
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@ -0,0 +1,325 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/cls_lock_client.h>
+#include <linux/ceph/decode.h>
+
+/**
+ * ceph_cls_lock - grab rados lock for object
+ * @oid, @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED)
+ * @cookie: user-defined identifier for this instance of the lock
+ * @tag: user-defined tag
+ * @desc: user-defined lock description
+ * @flags: lock flags
+ *
+ * All operations on the same lock should use the same tag.
+ */
+int ceph_cls_lock(struct ceph_osd_client *osdc,
+		  struct ceph_object_id *oid,
+		  struct ceph_object_locator *oloc,
+		  char *lock_name, u8 type, char *cookie,
+		  char *tag, char *desc, u8 flags)
+{
+	int lock_op_buf_size;
+	int name_len = strlen(lock_name);
+	int cookie_len = strlen(cookie);
+	int tag_len = strlen(tag);
+	int desc_len = strlen(desc);
+	void *p, *end;
+	struct page *lock_op_page;
+	struct timespec mtime;
+	int ret;
+
+	lock_op_buf_size = name_len + sizeof(__le32) +
+			   cookie_len + sizeof(__le32) +
+			   tag_len + sizeof(__le32) +
+			   desc_len + sizeof(__le32) +
+			   sizeof(struct ceph_timespec) +
+			   /* flag and type */
+			   sizeof(u8) + sizeof(u8) +
+			   CEPH_ENCODING_START_BLK_LEN;
+	if (lock_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	lock_op_page = alloc_page(GFP_NOIO);
+	if (!lock_op_page)
+		return -ENOMEM;
+
+	p = page_address(lock_op_page);
+	end = p + lock_op_buf_size;
+
+	/* encode cls_lock_lock_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    lock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+	ceph_encode_8(&p, type);
+	ceph_encode_string(&p, end, cookie, cookie_len);
+	ceph_encode_string(&p, end, tag, tag_len);
+	ceph_encode_string(&p, end, desc, desc_len);
+	/* only support infinite duration */
+	memset(&mtime, 0, sizeof(mtime));
+	ceph_encode_timespec(p, &mtime);
+	p += sizeof(struct ceph_timespec);
+	ceph_encode_8(&p, flags);
+
+	dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
+	     __func__, lock_name, type, cookie, tag, desc, flags);
+	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
+			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			     lock_op_page, lock_op_buf_size, NULL, NULL);
+
+	dout("%s: status %d\n", __func__, ret);
+	__free_page(lock_op_page);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_cls_lock);
+
+/**
+ * ceph_cls_unlock - release rados lock for object
+ * @oid, @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @cookie: user-defined identifier for this instance of the lock
+ */
+int ceph_cls_unlock(struct ceph_osd_client *osdc,
+		    struct ceph_object_id *oid,
+		    struct ceph_object_locator *oloc,
+		    char *lock_name, char *cookie)
+{
+	int unlock_op_buf_size;
+	int name_len = strlen(lock_name);
+	int cookie_len = strlen(cookie);
+	void *p, *end;
+	struct page *unlock_op_page;
+	int ret;
+
+	unlock_op_buf_size = name_len + sizeof(__le32) +
+			     cookie_len + sizeof(__le32) +
+			     CEPH_ENCODING_START_BLK_LEN;
+	if (unlock_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	unlock_op_page = alloc_page(GFP_NOIO);
+	if (!unlock_op_page)
+		return -ENOMEM;
+
+	p = page_address(unlock_op_page);
+	end = p + unlock_op_buf_size;
+
+	/* encode cls_lock_unlock_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    unlock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+	ceph_encode_string(&p, end, cookie, cookie_len);
+
+	dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
+	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
+			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			     unlock_op_page, unlock_op_buf_size, NULL, NULL);
+
+	dout("%s: status %d\n", __func__, ret);
+	__free_page(unlock_op_page);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_cls_unlock);
+
+/**
+ * ceph_cls_break_lock - release rados lock for object for specified client
+ * @oid, @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @cookie: user-defined identifier for this instance of the lock
+ * @locker: current lock owner
+ */
+int ceph_cls_break_lock(struct ceph_osd_client *osdc,
+			struct ceph_object_id *oid,
+			struct ceph_object_locator *oloc,
+			char *lock_name, char *cookie,
+			struct ceph_entity_name *locker)
+{
+	int break_op_buf_size;
+	int name_len = strlen(lock_name);
+	int cookie_len = strlen(cookie);
+	struct page *break_op_page;
+	void *p, *end;
+	int ret;
+
+	break_op_buf_size = name_len + sizeof(__le32) +
+			    cookie_len + sizeof(__le32) +
+			    sizeof(u8) + sizeof(__le64) +
+			    CEPH_ENCODING_START_BLK_LEN;
+	if (break_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	break_op_page = alloc_page(GFP_NOIO);
+	if (!break_op_page)
+		return -ENOMEM;
+
+	p = page_address(break_op_page);
+	end = p + break_op_buf_size;
+
+	/* encode cls_lock_break_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    break_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+	ceph_encode_copy(&p, locker, sizeof(*locker));
+	ceph_encode_string(&p, end, cookie, cookie_len);
+
+	dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
+	     cookie, ENTITY_NAME(*locker));
+	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
+			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			     break_op_page, break_op_buf_size, NULL, NULL);
+
+	dout("%s: status %d\n", __func__, ret);
+	__free_page(break_op_page);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_cls_break_lock);
+
+void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
+{
+	int i;
+
+	for (i = 0; i < num_lockers; i++)
+		kfree(lockers[i].id.cookie);
+	kfree(lockers);
+}
+EXPORT_SYMBOL(ceph_free_lockers);
+
+static int decode_locker(void **p, void *end, struct ceph_locker *locker)
+{
+	u8 struct_v;
+	u32 len;
+	char *s;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 1, "locker_id_t", &struct_v, &len);
+	if (ret)
+		return ret;
+
+	ceph_decode_copy(p, &locker->id.name, sizeof(locker->id.name));
+	s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	locker->id.cookie = s;
+
+	ret = ceph_start_decoding(p, end, 1, "locker_info_t", &struct_v, &len);
+	if (ret)
+		return ret;
+
+	*p += sizeof(struct ceph_timespec); /* skip expiration */
+	ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr));
+	ceph_decode_addr(&locker->info.addr);
+	len = ceph_decode_32(p);
+	*p += len; /* skip description */
+
+	dout("%s %s%llu cookie %s addr %s\n", __func__,
+	     ENTITY_NAME(locker->id.name), locker->id.cookie,
+	     ceph_pr_addr(&locker->info.addr.in_addr));
+	return 0;
+}
+
+static int decode_lockers(void **p, void *end, u8 *type, char **tag,
+			  struct ceph_locker **lockers, u32 *num_lockers)
+{
+	u8 struct_v;
+	u32 struct_len;
+	char *s;
+	int i;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 1, "cls_lock_get_info_reply",
+				  &struct_v, &struct_len);
+	if (ret)
+		return ret;
+
+	*num_lockers = ceph_decode_32(p);
+	*lockers = kcalloc(*num_lockers, sizeof(**lockers), GFP_NOIO);
+	if (!*lockers)
+		return -ENOMEM;
+
+	for (i = 0; i < *num_lockers; i++) {
+		ret = decode_locker(p, end, *lockers + i);
+		if (ret)
+			goto err_free_lockers;
+	}
+
+	*type = ceph_decode_8(p);
+	s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
+	if (IS_ERR(s)) {
+		ret = PTR_ERR(s);
+		goto err_free_lockers;
+	}
+
+	*tag = s;
+	return 0;
+
+err_free_lockers:
+	ceph_free_lockers(*lockers, *num_lockers);
+	return ret;
+}
+
+/*
+ * On success, the caller is responsible for:
+ *
+ *     kfree(tag);
+ *     ceph_free_lockers(lockers, num_lockers);
+ */
+int ceph_cls_lock_info(struct ceph_osd_client *osdc,
+		       struct ceph_object_id *oid,
+		       struct ceph_object_locator *oloc,
+		       char *lock_name, u8 *type, char **tag,
+		       struct ceph_locker **lockers, u32 *num_lockers)
+{
+	int get_info_op_buf_size;
+	int name_len = strlen(lock_name);
+	struct page *get_info_op_page, *reply_page;
+	size_t reply_len;
+	void *p, *end;
+	int ret;
+
+	get_info_op_buf_size = name_len + sizeof(__le32) +
+			       CEPH_ENCODING_START_BLK_LEN;
+	if (get_info_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	get_info_op_page = alloc_page(GFP_NOIO);
+	if (!get_info_op_page)
+		return -ENOMEM;
+
+	reply_page = alloc_page(GFP_NOIO);
+	if (!reply_page) {
+		__free_page(get_info_op_page);
+		return -ENOMEM;
+	}
+
+	p = page_address(get_info_op_page);
+	end = p + get_info_op_buf_size;
+
+	/* encode cls_lock_get_info_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    get_info_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+
+	dout("%s lock_name %s\n", __func__, lock_name);
+	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
+			     CEPH_OSD_FLAG_READ, get_info_op_page,
+			     get_info_op_buf_size, reply_page, &reply_len);
+
+	dout("%s: status %d\n", __func__, ret);
+	if (ret >= 0) {
+		p = page_address(reply_page);
+		end = p + reply_len;
+
+		ret = decode_lockers(&p, end, type, tag, lockers, num_lockers);
+	}
+
+	__free_page(get_info_op_page);
+	__free_page(reply_page);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_cls_lock_info);
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@ -245,7 +245,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 /* compute 2^44*log2(input+1) */
 static __u64 crush_ln(unsigned int xin)
 {
-	unsigned int x = xin, x1;
+	unsigned int x = xin;
 	int iexpon, index1, index2;
 	__u64 RH, LH, LL, xl64, result;

@ -253,9 +253,15 @@ static __u64 crush_ln(unsigned int xin)

 	/* normalize input */
 	iexpon = 15;
-	while (!(x & 0x18000)) {
-		x <<= 1;
-		iexpon--;
+
+	/*
+	 * figure out number of bits we need to shift and
+	 * do it in one step instead of iteratively
+	 */
+	if (!(x & 0x18000)) {
+		int bits = __builtin_clz(x & 0x1FFFF) - 16;
+		x <<= bits;
+		iexpon = 15 - bits;
 	}

 	index1 = (x >> 8) << 1;
@ -267,12 +273,11 @@ static __u64 crush_ln(unsigned int xin)
 	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
 	xl64 = (__s64)x * RH;
 	xl64 >>= 48;
-	x1 = xl64;

 	result = iexpon;
 	result <<= (12 + 32);

-	index2 = x1 & 0xff;
+	index2 = xl64 & 0xff;
 	/* LL ~ 2^48*log2(1.0+index2/2^15) */
 	LL = __LL_tbl[index2];

--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@ -835,6 +835,83 @@ int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
 }
 EXPORT_SYMBOL(ceph_monc_get_version_async);

+static void handle_command_ack(struct ceph_mon_client *monc,
+			       struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front_alloc_len;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+	ceph_decode_need(&p, end, sizeof(struct ceph_mon_request_header) +
+							    sizeof(u32), bad);
+	p += sizeof(struct ceph_mon_request_header);
+
+	mutex_lock(&monc->mutex);
+	req = lookup_generic_request(&monc->generic_request_tree, tid);
+	if (!req) {
+		mutex_unlock(&monc->mutex);
+		return;
+	}
+
+	req->result = ceph_decode_32(&p);
+	__finish_generic_request(req);
+	mutex_unlock(&monc->mutex);
+
+	complete_generic_request(req);
+	return;
+
+bad:
+	pr_err("corrupt mon_command ack, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
+			    struct ceph_entity_addr *client_addr)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_command *h;
+	int ret = -ENOMEM;
+	int len;
+
+	req = alloc_generic_request(monc, GFP_NOIO);
+	if (!req)
+		goto out;
+
+	req->request = ceph_msg_new(CEPH_MSG_MON_COMMAND, 256, GFP_NOIO, true);
+	if (!req->request)
+		goto out;
+
+	req->reply = ceph_msg_new(CEPH_MSG_MON_COMMAND_ACK, 512, GFP_NOIO,
+				  true);
+	if (!req->reply)
+		goto out;
+
+	mutex_lock(&monc->mutex);
+	register_generic_request(req);
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+	h->num_strs = cpu_to_le32(1);
+	len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
+		                 \"blacklistop\": \"add\", \
+				 \"addr\": \"%pISpc/%u\" }",
+		      &client_addr->in_addr, le32_to_cpu(client_addr->nonce));
+	h->str_len = cpu_to_le32(len);
+	send_generic_request(monc, req);
+	mutex_unlock(&monc->mutex);
+
+	ret = wait_generic_request(req);
+out:
+	put_generic_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_monc_blacklist_add);
+
 /*
 * Resend pending generic requests.
 */
@ -1139,6 +1216,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		handle_get_version_reply(monc, msg);
 		break;

+	case CEPH_MSG_MON_COMMAND_ACK:
+		handle_command_ack(monc, msg);
+		break;
+
 	case CEPH_MSG_MON_MAP:
 		ceph_monc_handle_map(monc, msg);
 		break;
@ -1178,6 +1259,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 		m = ceph_msg_get(monc->m_subscribe_ack);
 		break;
 	case CEPH_MSG_STATFS_REPLY:
+	case CEPH_MSG_MON_COMMAND_ACK:
 		return get_generic_reply(con, hdr, skip);
 	case CEPH_MSG_AUTH_REPLY:
 		m = ceph_msg_get(monc->m_auth_reply);
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@ -338,6 +338,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 		ceph_osd_data_release(&op->notify.request_data);
 		ceph_osd_data_release(&op->notify.response_data);
 		break;
+	case CEPH_OSD_OP_LIST_WATCHERS:
+		ceph_osd_data_release(&op->list_watchers.response_data);
+		break;
 	default:
 		break;
 	}
@ -863,6 +866,8 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
 	case CEPH_OSD_OP_NOTIFY:
 		dst->notify.cookie = cpu_to_le64(src->notify.cookie);
 		break;
+	case CEPH_OSD_OP_LIST_WATCHERS:
+		break;
 	case CEPH_OSD_OP_SETALLOCHINT:
 		dst->alloc_hint.expected_object_size =
 		    cpu_to_le64(src->alloc_hint.expected_object_size);
@ -1445,6 +1450,10 @@ static void setup_request_data(struct ceph_osd_request *req,
 			ceph_osdc_msg_data_add(req->r_reply,
 					       &op->extent.osd_data);
 			break;
+		case CEPH_OSD_OP_LIST_WATCHERS:
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->list_watchers.response_data);
+			break;

 		/* both */
 		case CEPH_OSD_OP_CALL:
@ -3891,12 +3900,121 @@ int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
 	return ret;
 }

+static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
+{
+	u8 struct_v;
+	u32 struct_len;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 2, "watch_item_t",
+				  &struct_v, &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_copy(p, &item->name, sizeof(item->name));
+	item->cookie = ceph_decode_64(p);
+	*p += 4; /* skip timeout_seconds */
+	if (struct_v >= 2) {
+		ceph_decode_copy(p, &item->addr, sizeof(item->addr));
+		ceph_decode_addr(&item->addr);
+	}
+
+	dout("%s %s%llu cookie %llu addr %s\n", __func__,
+	     ENTITY_NAME(item->name), item->cookie,
+	     ceph_pr_addr(&item->addr.in_addr));
+	return 0;
+}
+
+static int decode_watchers(void **p, void *end,
+			   struct ceph_watch_item **watchers,
+			   u32 *num_watchers)
+{
+	u8 struct_v;
+	u32 struct_len;
+	int i;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
+				  &struct_v, &struct_len);
+	if (ret)
+		return ret;
+
+	*num_watchers = ceph_decode_32(p);
+	*watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO);
+	if (!*watchers)
+		return -ENOMEM;
+
+	for (i = 0; i < *num_watchers; i++) {
+		ret = decode_watcher(p, end, *watchers + i);
+		if (ret) {
+			kfree(*watchers);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * On success, the caller is responsible for:
+ *
+ *     kfree(watchers);
+ */
+int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
+			    struct ceph_object_id *oid,
+			    struct ceph_object_locator *oloc,
+			    struct ceph_watch_item **watchers,
+			    u32 *num_watchers)
+{
+	struct ceph_osd_request *req;
+	struct page **pages;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = CEPH_OSD_FLAG_READ;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_put_req;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages)) {
+		ret = PTR_ERR(pages);
+		goto out_put_req;
+	}
+
+	osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
+	ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
+						 response_data),
+				 pages, PAGE_SIZE, 0, false, true);
+
+	ceph_osdc_start_request(osdc, req, false);
+	ret = ceph_osdc_wait_request(osdc, req);
+	if (ret >= 0) {
+		void *p = page_address(pages[0]);
+		void *const end = p + req->r_ops[0].outdata_len;
+
+		ret = decode_watchers(&p, end, watchers, num_watchers);
+	}
+
+out_put_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_list_watchers);
+
 /*
 * Call all pending notify callbacks - for use after a watch is
 * unregistered, to make sure no more callbacks for it will be invoked
 */
 void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
 {
+	dout("%s osdc %p\n", __func__, osdc);
 	flush_workqueue(osdc->notify_wq);
 }
 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
@ -3909,6 +4027,57 @@ void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
 }
 EXPORT_SYMBOL(ceph_osdc_maybe_request_map);

+/*
+ * Execute an OSD class method on an object.
+ *
+ * @flags: CEPH_OSD_FLAG_*
+ * @resp_len: out param for reply length
+ */
+int ceph_osdc_call(struct ceph_osd_client *osdc,
+		   struct ceph_object_id *oid,
+		   struct ceph_object_locator *oloc,
+		   const char *class, const char *method,
+		   unsigned int flags,
+		   struct page *req_page, size_t req_len,
+		   struct page *resp_page, size_t *resp_len)
+{
+	struct ceph_osd_request *req;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = flags;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_put_req;
+
+	osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
+	if (req_page)
+		osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
+						  0, false, false);
+	if (resp_page)
+		osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+						   PAGE_SIZE, 0, false, false);
+
+	ceph_osdc_start_request(osdc, req, false);
+	ret = ceph_osdc_wait_request(osdc, req);
+	if (ret >= 0) {
+		ret = req->r_ops[0].rval;
+		if (resp_page)
+			*resp_len = req->r_ops[0].outdata_len;
+	}
+
+out_put_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_call);
+
 /*
 * init, shutdown
 */