diff --git a/MAINTAINERS b/MAINTAINERS index f2a2b8e647c5..3d4179fbc526 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt F: fs/ceph +F: net/ceph +F: include/linux/ceph CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: M: David Vrabel @@ -4805,6 +4807,15 @@ F: fs/qnx4/ F: include/linux/qnx4_fs.h F: include/linux/qnxtypes.h +RADOS BLOCK DEVICE (RBD) +F: include/linux/qnxtypes.h +M: Yehuda Sadeh +M: Sage Weil +L: ceph-devel@vger.kernel.org +S: Supported +F: drivers/block/rbd.c +F: drivers/block/rbd_types.h + RADEON FRAMEBUFFER DISPLAY DRIVER M: Benjamin Herrenschmidt L: linux-fbdev@vger.kernel.org diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index de277689da61..4b9359a6f6ca 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -488,4 +488,21 @@ config BLK_DEV_HD If unsure, say N. +config BLK_DEV_RBD + tristate "Rados block device (RBD)" + depends on INET && EXPERIMENTAL && BLOCK + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Say Y here if you want to include the Rados block device, which stripes + a block device over objects stored in the Ceph distributed object + store. + + More information at http://ceph.newdream.net/. + + If unsure, say N. 
+ endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index aff5ac925c34..d7f463d6312d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ +obj-$(CONFIG_BLK_DEV_RBD) += rbd.o swim_mod-objs := swim.o swim_asm.o diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c new file mode 100644 index 000000000000..6ec9d53806c5 --- /dev/null +++ b/drivers/block/rbd.c @@ -0,0 +1,1841 @@ +/* + rbd.c -- Export ceph rados objects as a Linux block device + + + based on drivers/block/osdblk.c: + + Copyright 2009 Red Hat, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + + + Instructions for use + -------------------- + + 1) Map a Linux block device to an existing rbd image. + + Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name] + + $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add + + The snapshot name can be "-" or omitted to map the image read/write. + + 2) List all active blkdev<->object mappings. 
+ + In this example, we have performed step #1 twice, creating two blkdevs, + mapped to two separate rados objects in the rados rbd pool + + $ cat /sys/class/rbd/list + #id major client_name pool name snap KB + 0 254 client4143 rbd foo - 1024000 + + The columns, in order, are: + - blkdev unique id + - blkdev assigned major + - rados client id + - rados pool name + - rados block device name + - mapped snapshot ("-" if none) + - device size in KB + + + 3) Create a snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_create + + + 4) Listing a snapshot. + + $ cat /sys/class/rbd/snaps_list + #id snap KB + 0 - 1024000 (*) + 0 foo 1024000 + + The columns, in order, are: + - blkdev unique id + - snapshot name, '-' means none (active read/write version) + - size of device at time of snapshot + - the (*) indicates this is the active version + + 5) Rollback to snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_rollback + + + 6) Mapping an image using snapshot. + + A snapshot mapping is read-only. This is done by passing + snap=<snapname> to the options when adding a device. + + $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add + + + 7) Remove an active blkdev<->rbd image mapping. + + In this example, we remove the mapping with blkdev unique id 1. + + $ echo 1 > /sys/class/rbd/remove + + + NOTE: The actual creation and deletion of rados objects is outside the scope + of this driver. 
+
+ */
+
+/*
+ * NOTE(review): this copy of the patch had lost the header names (nine bare
+ * "#include" directives).  The list below is reconstructed from the
+ * identifiers the driver uses -- confirm against the original posting.
+ */
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN	(96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN	64
+#define RBD_MAX_SNAP_NAME_LEN	32
+#define RBD_MAX_OPT_LEN		1024
+
+#define RBD_SNAP_HEAD_NAME	"-"
+
+#define DEV_NAME_LEN		32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+	u64 image_size;			/* bytes (reported as KB via >> 10) */
+	char block_name[32];		/* prefix of the per-segment object names */
+	__u8 obj_order;			/* log2 of the segment (object) size */
+	__u8 crypt_type;
+	__u8 comp_type;
+	struct rw_semaphore snap_rwsem;	/* taken around snapc/snapshot updates */
+	struct ceph_snap_context *snapc;
+	size_t snap_names_len;		/* bytes in snap_names */
+	u64 snap_seq;
+	u32 total_snaps;		/* entries in snapc->snaps / snap_sizes */
+
+	char *snap_names;		/* NUL-separated names, one per snapshot */
+	u64 *snap_sizes;		/* image size recorded for each snapshot */
+};
+
+/*
+ * an instance of the client.  multiple devices may share a client.
+ */
+struct rbd_client {
+	struct ceph_client	*client;
+	struct kref		kref;	/* one reference per rbd_device using it */
+	struct list_head	node;	/* entry in rbd_client_list */
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+	struct request		*rq;		/* blk layer request */
+	struct bio		*bio;		/* cloned bio */
+	struct page		**pages;	/* list of used pages */
+	u64			len;
+};
+
+/*
+ * a single device
+ *
+ * obj, pool_name and snap_name are filled by sscanf() in class_rbd_add()
+ * using RBD_MAX_*_LEN as the field width; a width of N stores up to N
+ * characters plus the terminating NUL, hence the "+ 1" on those arrays.
+ */
+struct rbd_device {
+	int			id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+	struct request_queue	*q;
+
+	struct ceph_client	*client;
+	struct rbd_client	*rbd_client;
+
+	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+	spinlock_t		lock;		/* queue lock */
+
+	struct rbd_image_header	header;
+	char			obj[RBD_MAX_OBJ_NAME_LEN + 1]; /* rbd image name */
+	int			obj_len;
+	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+	char			pool_name[RBD_MAX_POOL_NAME_LEN + 1];
+	int			poolid;
+
+	char			snap_name[RBD_MAX_SNAP_NAME_LEN + 1];
+	u32 cur_snap;	/* index+1 of current snapshot within snap context
+			   0 - for the head */
+	int read_only;
+
+	struct list_head	node;
+};
+
+static spinlock_t node_lock;      /* protects client get/put */
+
+static struct class *class_rbd;	  /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static LIST_HEAD(rbd_client_list);      /* clients */
+
+
+/*
+ * Block device open: propagate the mapping's read-only state to the bdev
+ * and refuse write opens of a read-only (snapshot) mapping with -EROFS.
+ */
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct rbd_device *rbd_dev = disk->private_data;
+
+	set_device_ro(bdev, rbd_dev->read_only);
+
+	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+		return -EROFS;
+
+	return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+	.owner			= THIS_MODULE,
+	.open			= rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ *
+ * On success the new client (holding one reference) has been added to
+ * rbd_client_list.  On failure *opt is destroyed unless the ceph client
+ * had already taken it over, and an ERR_PTR() is returned.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+	struct rbd_client *rbdc;
+	int ret = -ENOMEM;
+
+	dout("rbd_client_create\n");
+	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+	if (!rbdc)
+		goto out_opt;
+
+	kref_init(&rbdc->kref);
+	INIT_LIST_HEAD(&rbdc->node);
+
+	rbdc->client = ceph_create_client(opt, rbdc);
+	if (IS_ERR(rbdc->client)) {
+		/* report the real cause rather than a blanket -ENOMEM */
+		ret = PTR_ERR(rbdc->client);
+		goto out_rbdc;
+	}
+	opt = NULL; /* Now rbdc->client is responsible for opt */
+
+	ret = ceph_open_session(rbdc->client);
+	if (ret < 0)
+		goto out_err;
+
+	spin_lock(&node_lock);
+	list_add_tail(&rbdc->node, &rbd_client_list);
+	spin_unlock(&node_lock);
+
+	dout("rbd_client_create created %p\n", rbdc);
+	return rbdc;
+
+out_err:
+	ceph_destroy_client(rbdc->client);
+out_rbdc:
+	kfree(rbdc);
+out_opt:
+	if (opt)
+		ceph_destroy_options(opt);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ *
+ * Walks rbd_client_list (rbd_get_client() calls this with node_lock held)
+ * and returns the first entry whose options compare equal, or NULL.  No
+ * reference is taken here.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+	struct rbd_client *rbdc;
+
+	/* options flagged CEPH_OPT_NOSHARE never reuse an existing client */
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		return NULL;
+
+	list_for_each_entry(rbdc, &rbd_client_list, node) {
+		if (!ceph_compare_options(opt, rbdc->client))
+			return rbdc;
+	}
+
+	return NULL;
+}
+
+/*
+ * Attach a ceph client to @rbd_dev: reuse a listed client whose options
+ * match the parsed @mon_addr/@options, otherwise create a new one.
+ *
+ * Returns 0 with rbd_dev->rbd_client and rbd_dev->client set, or a
+ * negative errno from option parsing / client creation.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+			  char *options)
+{
+	struct ceph_options *opt;
+	struct rbd_client *rbdc;
+	int ret;
+
+	ret = ceph_parse_options(&opt, options, mon_addr,
+				 mon_addr + strlen(mon_addr), NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	spin_lock(&node_lock);
+	rbdc = __rbd_client_find(opt);
+	if (!rbdc) {
+		spin_unlock(&node_lock);
+
+		/* nothing to share: rbd_client_create() takes over opt */
+		rbdc = rbd_client_create(opt);
+		if (IS_ERR(rbdc))
+			return PTR_ERR(rbdc);
+	} else {
+		/* using an existing client; our parsed copy is not needed */
+		ceph_destroy_options(opt);
+		kref_get(&rbdc->kref);
+		spin_unlock(&node_lock);
+	}
+
+	rbd_dev->rbd_client = rbdc;
+	rbd_dev->client = rbdc->client;
+	return 0;
+}
+
+/*
+ * kref release callback: unlink the client from rbd_client_list, destroy
+ * the ceph client and free the wrapper.
+ */
+static void rbd_client_release(struct kref *kref)
+{
+	struct rbd_client *rbdc;
+
+	rbdc = container_of(kref, struct rbd_client, kref);
+	dout("rbd_release_client %p\n", rbdc);
+
+	spin_lock(&node_lock);
+	list_del(&rbdc->node);
+	spin_unlock(&node_lock);
+
+	ceph_destroy_client(rbdc->client);
+	kfree(rbdc);
+}
+
+/*
+ * Drop the device's reference on its client (released once nobody holds
+ * it any more) and clear the device's client pointers.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+	struct rbd_client *rbdc = rbd_dev->rbd_client;
+
+	kref_put(&rbdc->kref, rbd_client_release);
+
+	rbd_dev->rbd_client = NULL;
+	rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ *
+ * @allocated_snaps is the number of snapshot records the caller read into
+ * @ondisk.  Snapshot ids, sizes and names are only copied when it matches
+ * the on-disk snap_count; otherwise the arrays are allocated but left
+ * unfilled so that the caller can re-read with the right size.
+ *
+ * Returns 0 or -ENOMEM.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+				 struct rbd_image_header_ondisk *ondisk,
+				 int allocated_snaps,
+				 gfp_t gfp_flags)
+{
+	u32 i;
+	u32 snap_count = le32_to_cpu(ondisk->snap_count);
+	int ret = -ENOMEM;
+
+	init_rwsem(&header->snap_rwsem);
+
+	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+	/*
+	 * NOTE(review): the per-snapshot size used here is that of the
+	 * on-disk record, not of a snapc->snaps[] element; presumably this
+	 * only over-allocates -- confirm.
+	 */
+	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+				snap_count *
+				 sizeof(struct rbd_image_snap_ondisk),
+				gfp_flags);
+	if (!header->snapc)
+		return -ENOMEM;
+	if (snap_count) {
+		/* honour the caller's gfp_flags for every allocation */
+		header->snap_names = kmalloc(header->snap_names_len,
+					     gfp_flags);
+		if (!header->snap_names)
+			goto err_snapc;
+		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+					     gfp_flags);
+		if (!header->snap_sizes)
+			goto err_names;
+	} else {
+		header->snap_names = NULL;
+		header->snap_sizes = NULL;
+	}
+	memcpy(header->block_name, ondisk->block_name,
+	       sizeof(ondisk->block_name));
+
+	header->image_size = le64_to_cpu(ondisk->image_size);
+	header->obj_order = ondisk->options.order;
+	header->crypt_type = ondisk->options.crypt_type;
+	header->comp_type = ondisk->options.comp_type;
+
+	atomic_set(&header->snapc->nref, 1);
+	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+	header->snapc->num_snaps = snap_count;
+	header->total_snaps = snap_count;
+
+	if (snap_count &&
+	    allocated_snaps == snap_count) {
+		for (i = 0; i < snap_count; i++) {
+			header->snapc->snaps[i] =
+				le64_to_cpu(ondisk->snaps[i].id);
+			header->snap_sizes[i] =
+				le64_to_cpu(ondisk->snaps[i].image_size);
+		}
+
+		/* copy snapshot names; they follow the last snapshot record */
+		memcpy(header->snap_names, &ondisk->snaps[i],
+			header->snap_names_len);
+	}
+
+	return 0;
+
+err_names:
+	kfree(header->snap_names);
+err_snapc:
+	kfree(header->snapc);
+	return ret;
+}
+
+/*
+ * Translate a cur_snap style number (total_snaps - index) back into an
+ * index into the snapshot arrays.
+ */
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+	return header->total_snaps - snap_num;
+}
+
+/*
+ * Snapshot id the device is currently mapped to, 0 when mapped to the head.
+ */
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+	struct rbd_image_header *header = &rbd_dev->header;
+
+	if (!rbd_dev->cur_snap)
+		return 0;
+
+	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+/*
+ * Look up @snap_name in the NUL-separated name list.
+ *
+ * Returns the snapshot's index and, when the pointers are non-NULL, stores
+ * its id in *seq and its recorded image size in *size.  Returns -ENOENT
+ * when no snapshot has that name.
+ */
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+			u64 *seq, u64 *size)
+{
+	int i;
+	char *p = header->snap_names;
+
+	for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+		if (strcmp(snap_name, p) == 0)
+			break;
+	}
+	if (i == header->total_snaps)
+		return -ENOENT;
+	if (seq)
+		*seq = header->snapc->snaps[i];
+
+	if (size)
+		*size = header->snap_sizes[i];
+
+	return i;
+}
+
+/*
+ * Select what the device maps: the head (NULL, empty or
+ * RBD_SNAP_HEAD_NAME) read/write, or the named snapshot read-only.
+ *
+ * Updates snapc->seq, dev->cur_snap and dev->read_only under snap_rwsem
+ * and stores the size of the selected version in *size when non-NULL.
+ * Returns 0, or -ENOENT for an unknown snapshot name.
+ */
+static int rbd_header_set_snap(struct rbd_device *dev,
+			       const char *snap_name,
+			       u64 *size)
+{
+	struct rbd_image_header *header = &dev->header;
+	struct ceph_snap_context *snapc = header->snapc;
+	int ret = -ENOENT;
+
+	down_write(&header->snap_rwsem);
+
+	if (!snap_name ||
+	    !*snap_name ||
+	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+		if (header->total_snaps)
+			snapc->seq = header->snap_seq;
+		else
+			snapc->seq = 0;
+		dev->cur_snap = 0;
+		dev->read_only = 0;
+		if (size)
+			*size = header->image_size;
+	} else {
+		ret = snap_by_name(header, snap_name, &snapc->seq, size);
+		if (ret < 0)
+			goto done;
+
+		/* stored as total_snaps - index, see snap_index() */
+		dev->cur_snap = header->total_snaps - ret;
+		dev->read_only = 1;
+	}
+
+	ret = 0;
+done:
+	up_write(&header->snap_rwsem);
+	return ret;
+}
+
+/*
+ * Free the snapshot context and the snapshot name/size arrays.
+ */
+static void rbd_header_free(struct rbd_image_header *header)
+{
+	kfree(header->snapc);
+	kfree(header->snap_names);
+	kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ *
+ * Maps image offset @ofs to its segment (object).  When @seg_name is
+ * non-NULL it receives "<block_name>.<segment number as 12 hex digits>";
+ * when @segofs is non-NULL it receives the offset inside that segment.
+ * Returns @len clipped so that the range does not cross the segment end.
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+			   const char *block_name,
+			   u64 ofs, u64 len,
+			   char *seg_name, u64 *segofs)
+{
+	/*
+	 * obj_order comes straight from the on-disk header; do the shift in
+	 * 64 bits so that an order of 31 or more is not an int overflow.
+	 */
+	u64 seg_size = 1ULL << header->obj_order;
+	u64 seg = ofs >> header->obj_order;
+
+	if (seg_name)
+		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+			 "%s.%012llx", block_name, seg);
+
+	ofs = ofs & (seg_size - 1);
+	len = min_t(u64, len, seg_size - ofs);
+
+	if (segofs)
+		*segofs = ofs;
+
+	return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void
bio_chain_put(struct bio *chain)
+{
+	struct bio *tmp;
+
+	/* drop one reference on every bio linked through bi_next */
+	while (chain) {
+		tmp = chain;
+		chain = chain->bi_next;
+		bio_put(tmp);
+	}
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ *
+ * Everything from byte @start_ofs (counted across the whole chain) to the
+ * end of the chain is cleared.
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+	struct bio_vec *bv;
+	unsigned long flags;
+	void *buf;
+	int i;
+	int pos = 0;
+
+	while (chain) {
+		bio_for_each_segment(bv, chain, i) {
+			if (pos + bv->bv_len > start_ofs) {
+				/* first affected vec may start mid-way */
+				int remainder = max(start_ofs - pos, 0);
+				buf = bvec_kmap_irq(bv, &flags);
+				memset(buf + remainder, 0,
+				       bv->bv_len - remainder);
+				bvec_kunmap_irq(buf, &flags);
+			}
+			pos += bv->bv_len;
+		}
+
+		chain = chain->bi_next;
+	}
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ *
+ * Clones bios from *old until @len bytes are covered, splitting the last
+ * one if it crosses @len.  On return *old and *next describe where the
+ * remaining chain continues.  A pair left in *bp by a previous call is
+ * released first.  Returns the new chain, or NULL on allocation failure.
+ *
+ * NOTE(review): the pair produced by a split is kept in a local and is not
+ * handed back through @bp, so callers never see it.  Storing it in *bp
+ * would additionally require rbd_rq_fn() to reset its pointer after
+ * releasing the pair, otherwise the next call here would release it again.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+				   struct bio_pair **bp,
+				   int len, gfp_t gfpmask)
+{
+	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+	int total = 0;
+
+	if (*bp) {
+		bio_pair_release(*bp);
+		*bp = NULL;
+	}
+
+	while (old_chain && (total < len)) {
+		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+		if (!tmp)
+			goto err_out;
+
+		if (total + old_chain->bi_size > len) {
+			struct bio_pair *split;
+
+			/*
+			 * this split can only happen with a single paged bio,
+			 * split_bio will BUG_ON if this is not the case
+			 */
+			dout("bio_chain_clone split! total=%d remaining=%d "
+			     "bi_size=%d\n",
+			     (int)total, (int)len-total,
+			     (int)old_chain->bi_size);
+
+			/* split the bio. We'll release it either in the next
+			   call, or it will have to be released outside */
+			split = bio_split(old_chain, (len - total) / 512ULL);
+			if (!split) {
+				/* tmp is not on new_chain yet; drop it here */
+				bio_put(tmp);
+				goto err_out;
+			}
+
+			__bio_clone(tmp, &split->bio1);
+
+			*next = &split->bio2;
+		} else {
+			__bio_clone(tmp, old_chain);
+			*next = old_chain->bi_next;
+		}
+
+		tmp->bi_bdev = NULL;
+		/* only the first allocation of a chain may block */
+		gfpmask &= ~__GFP_WAIT;
+		tmp->bi_next = NULL;
+
+		if (!new_chain) {
+			new_chain = tail = tmp;
+		} else {
+			tail->bi_next = tmp;
+			tail = tmp;
+		}
+		old_chain = old_chain->bi_next;
+
+		total += tmp->bi_size;
+	}
+
+	BUG_ON(total < len);
+
+	if (tail)
+		tail->bi_next = NULL;
+
+	*old = old_chain;
+
+	return new_chain;
+
+err_out:
+	dout("bio_chain_clone with err\n");
+	bio_chain_put(new_chain);
+	return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ *
+ * Allocates a zeroed vector of @num_ops + 1 entries (the extra entry is
+ * presumably a terminator for the osd client -- confirm) and fills in the
+ * opcode and payload length of the first one.  Returns 0 or -ENOMEM.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+			    int num_ops,
+			    int opcode,
+			    u32 payload_len)
+{
+	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+		       GFP_NOIO);
+	if (!*ops)
+		return -ENOMEM;
+	(*ops)[0].op = opcode;
+	/*
+	 * op extent offset and length will be set later on
+	 * in calc_raw_layout()
+	 */
+	(*ops)[0].payload_len = payload_len;
+	return 0;
+}
+
+/* Free a vector obtained from rbd_create_rw_ops(). */
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+	kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+			  struct rbd_device *dev,
+			  struct ceph_snap_context *snapc,
+			  u64 snapid,
+			  const char *obj, u64 ofs, u64 len,
+			  struct bio *bio,
+			  struct page **pages,
+			  int num_pages,
+			  int flags,
+			  struct ceph_osd_req_op *ops,
+			  int num_reply,
+			  void (*rbd_cb)(struct ceph_osd_request *req,
+					 struct ceph_msg *msg))
+{
+	struct ceph_osd_request *req;
+	struct ceph_file_layout *layout;
+	int ret;
+	u64 bno;
+	struct timespec mtime = CURRENT_TIME;
+	struct rbd_request *req_data;
+	struct ceph_osd_request_head *reqhead;
+	struct rbd_image_header *header = &dev->header;
+
+	ret = -ENOMEM;
+	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+	if (!req_data)
+		goto
done;
+
+	dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+	/* keep the snapshot context stable while the request is built */
+	down_read(&header->snap_rwsem);
+
+	req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+				      snapc,
+				      ops,
+				      false,
+				      GFP_NOIO, pages, bio);
+	if (IS_ERR(req)) {
+		up_read(&header->snap_rwsem);
+		ret = PTR_ERR(req);
+		goto done_pages;
+	}
+
+	req->r_callback = rbd_cb;
+
+	req_data->rq = rq;
+	req_data->bio = bio;
+	req_data->pages = pages;
+	req_data->len = len;
+
+	req->r_priv = req_data;
+
+	reqhead = req->r_request->front.iov_base;
+	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+	/* strncpy() does not terminate on truncation; do it explicitly */
+	strncpy(req->r_oid, obj, sizeof(req->r_oid) - 1);
+	req->r_oid[sizeof(req->r_oid) - 1] = '\0';
+	req->r_oid_len = strlen(req->r_oid);
+
+	layout = &req->r_file_layout;
+	memset(layout, 0, sizeof(*layout));
+	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	layout->fl_stripe_count = cpu_to_le32(1);
+	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	layout->fl_pg_preferred = cpu_to_le32(-1);
+	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+	ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+			     ofs, &len, &bno, req, ops);
+
+	ceph_osdc_build_request(req, ofs, &len,
+				ops,
+				snapc,
+				&mtime,
+				req->r_oid, req->r_oid_len);
+	up_read(&header->snap_rwsem);
+
+	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+	if (ret < 0)
+		goto done_err;
+
+	if (!rbd_cb) {
+		/*
+		 * Synchronous request: no callback will run, so the request
+		 * and its private data are released here.
+		 */
+		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+		ceph_osdc_put_request(req);
+		kfree(req_data);
+	}
+	return ret;
+
+done_err:
+	ceph_osdc_put_request(req);
+done_pages:
+	kfree(req_data);
+done:
+	/*
+	 * Nothing was submitted, so the completion callback will never drop
+	 * the caller's reference on the cloned chain; do it here for every
+	 * failure path (a NULL chain is fine).
+	 */
+	bio_chain_put(bio);
+	if (rq)
+		blk_end_request(rq, ret, len);
+	return ret;
+}
+
+/*
+ * Ceph osd op callback
+ *
+ * Completes the block request for one asynchronous osd operation, then
+ * releases the cloned bio chain, the osd request and the rbd_request.
+ * Reads of a missing object (-ENOENT) return zeros, and a short read is
+ * zero-filled up to the requested length.
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+	struct rbd_request *req_data = req->r_priv;
+	struct ceph_osd_reply_head *replyhead;
+	struct ceph_osd_op *op;
+	__s32 rc;
+	u64 bytes;
+	int read_op;
+
+	/* parse reply */
+	replyhead = msg->front.iov_base;
+	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+	/* the first op follows the reply head directly */
+	op = (void *)(replyhead + 1);
+	rc = le32_to_cpu(replyhead->result);
+	bytes = le64_to_cpu(op->extent.length);
+	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+	if (rc == -ENOENT && read_op) {
+		zero_bio_chain(req_data->bio, 0);
+		rc = 0;
+	} else if (rc == 0 && read_op && bytes < req_data->len) {
+		zero_bio_chain(req_data->bio, bytes);
+		bytes = req_data->len;
+	}
+
+	blk_end_request(req_data->rq, rc, bytes);
+
+	if (req_data->bio)
+		bio_chain_put(req_data->bio);
+
+	ceph_osdc_put_request(req);
+	kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ *
+ * Data travels through a temporary page vector: @buf is copied in before
+ * a write and copied out after a read.  When @orig_ops is NULL a single
+ * op for @opcode is created and destroyed here; otherwise the caller's
+ * vector is used as is.
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+			   struct ceph_snap_context *snapc,
+			   u64 snapid,
+			   int opcode,
+			   int flags,
+			   struct ceph_osd_req_op *orig_ops,
+			   int num_reply,
+			   const char *obj,
+			   u64 ofs, u64 len,
+			   char *buf)
+{
+	int ret;
+	struct page **pages;
+	int num_pages;
+	struct ceph_osd_req_op *ops = orig_ops;
+	u32 payload_len;
+
+	num_pages = calc_pages_for(ofs , len);
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	if (!orig_ops) {
+		payload_len = (flags & CEPH_OSD_FLAG_WRITE ?
len : 0);
+		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+		if (ret < 0)
+			goto done;
+
+		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+			if (ret < 0)
+				goto done_ops;
+		}
+	}
+
+	/* no request, no bio, no callback: rbd_do_request() waits for us */
+	ret = rbd_do_request(NULL, dev, snapc, snapid,
+			  obj, ofs, len, NULL,
+			  pages, num_pages,
+			  flags,
+			  ops,
+			  2,
+			  NULL);
+	if (ret < 0)
+		goto done_ops;
+
+	if ((flags & CEPH_OSD_FLAG_READ) && buf)
+		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+	/* only destroy the vector if it was created here */
+	if (!orig_ops)
+		rbd_destroy_ops(ops);
+done:
+	ceph_release_page_vector(pages, num_pages);
+	return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ *
+ * Sends one read or write covering @len bytes at image offset @ofs, which
+ * must lie within a single segment, using the cloned chain @bio.
+ * Completion is reported through rbd_req_cb().
+ *
+ * The caller (rbd_rq_fn()) ignores the return value, so on every failure
+ * the request is completed with the error and the reference on @bio is
+ * dropped before returning: rbd_do_request() does that for its own
+ * failures, the "fail" path below for failures that happen before it.
+ */
+static int rbd_do_op(struct request *rq,
+		     struct rbd_device *rbd_dev ,
+		     struct ceph_snap_context *snapc,
+		     u64 snapid,
+		     int opcode, int flags, int num_reply,
+		     u64 ofs, u64 len,
+		     struct bio *bio)
+{
+	char *seg_name;
+	u64 seg_ofs;
+	u64 seg_len;
+	int ret;
+	struct ceph_osd_req_op *ops;
+	u32 payload_len;
+
+	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+	if (!seg_name) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	seg_len = rbd_get_segment(&rbd_dev->header,
+				  rbd_dev->header.block_name,
+				  ofs, len,
+				  seg_name, &seg_ofs);
+
+	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+	if (ret < 0)
+		goto fail_name;
+
+	/* we've taken care of segment sizes earlier when we
+	   cloned the bios. We should never have a segment
+	   truncated at this point */
+	BUG_ON(seg_len < len);
+
+	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+			     seg_name, seg_ofs, seg_len,
+			     bio,
+			     NULL, 0,
+			     flags,
+			     ops,
+			     num_reply,
+			     rbd_req_cb);
+
+	/*
+	 * The op vector was never freed on this path.
+	 * NOTE(review): freeing it here assumes the osd client encodes the
+	 * ops into the request message while rbd_do_request() builds it and
+	 * keeps no pointer to the vector afterwards -- confirm.
+	 */
+	rbd_destroy_ops(ops);
+	kfree(seg_name);
+	return ret;
+
+fail_name:
+	kfree(seg_name);
+fail:
+	bio_chain_put(bio);
+	if (rq)
+		blk_end_request(rq, ret, len);
+	return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+			 struct rbd_device *rbd_dev,
+			 struct ceph_snap_context *snapc,
+			 u64 ofs, u64 len,
+			 struct bio *bio)
+{
+	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+			 CEPH_OSD_OP_WRITE,
+			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			 2,
+			 ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ *
+ * A @snapid of 0 reads the head (CEPH_NOSNAP).
+ */
+static int rbd_req_read(struct request *rq,
+			 struct rbd_device *rbd_dev,
+			 u64 snapid,
+			 u64 ofs, u64 len,
+			 struct bio *bio)
+{
+	return rbd_do_op(rq, rbd_dev, NULL,
+			 (snapid ? snapid : CEPH_NOSNAP),
+			 CEPH_OSD_OP_READ,
+			 CEPH_OSD_FLAG_READ,
+			 2,
+			 ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+			  struct ceph_snap_context *snapc,
+			  u64 snapid,
+			  const char *obj,
+			  u64 ofs, u64 len,
+			  char *buf)
+{
+	return rbd_req_sync_op(dev, NULL,
+			       (snapid ?
snapid : CEPH_NOSNAP),
+			       CEPH_OSD_OP_READ,
+			       CEPH_OSD_FLAG_READ,
+			       NULL,
+			       1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd rollback: roll object @obj back to snapshot @snapid
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+				     u64 snapid,
+				     const char *obj)
+{
+	struct ceph_osd_req_op *ops;
+	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+	if (ret < 0)
+		return ret;
+
+	ops[0].snap.snapid = snapid;
+
+	ret = rbd_req_sync_op(dev, NULL,
+			       CEPH_NOSNAP,
+			       0,
+			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			       ops,
+			       1, obj, 0, 0, NULL);
+
+	rbd_destroy_ops(ops);
+
+	return ret;
+}
+
+/*
+ * Request sync osd class method call: run @cls.@method on object @obj,
+ * passing @len bytes of @data as input
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+			     const char *obj,
+			     const char *cls,
+			     const char *method,
+			     const char *data,
+			     int len)
+{
+	struct ceph_osd_req_op *ops;
+	int cls_len = strlen(cls);
+	int method_len = strlen(method);
+	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+				    cls_len + method_len + len);
+	if (ret < 0)
+		return ret;
+
+	ops[0].cls.class_name = cls;
+	ops[0].cls.class_len = (__u8)cls_len;
+	ops[0].cls.method_name = method;
+	ops[0].cls.method_len = (__u8)method_len;
+	ops[0].cls.argc = 0;
+	ops[0].cls.indata = data;
+	ops[0].cls.indata_len = len;
+
+	ret = rbd_req_sync_op(dev, NULL,
+			       CEPH_NOSNAP,
+			       0,
+			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			       ops,
+			       1, obj, 0, 0, NULL);
+
+	rbd_destroy_ops(ops);
+
+	dout("cls_exec returned %d\n", ret);
+	return ret;
+}
+
+/*
+ * block device queue callback
+ *
+ * Entered with the queue lock held.  Each fetched request is cut into
+ * pieces that stay within one segment; every piece gets its own cloned
+ * bio chain and its own asynchronous osd request.  The queue lock is
+ * dropped while the pieces are submitted.
+ *
+ * NOTE(review): when cloning fails part-way through a request, pieces
+ * that were already submitted will still complete against a request that
+ * is ended here with -ENOMEM.
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	struct request *rq;
+	struct bio_pair *bp = NULL;
+
+	rq = blk_fetch_request(q);
+
+	while (1) {
+		struct bio *bio;
+		struct bio *rq_bio, *next_bio = NULL;
+		bool do_write;
+		int size, op_size = 0;
+		u64 ofs;
+
+		/* peek at request from block layer */
+		if (!rq)
+			break;
+
+		dout("fetched request\n");
+
+		/* filter out block requests we don't understand */
+		if ((rq->cmd_type != REQ_TYPE_FS)) {
+			__blk_end_request_all(rq, 0);
+			goto next;
+		}
+
+		/* deduce our operation (read, write) */
+		do_write = (rq_data_dir(rq) == WRITE);
+
+		size = blk_rq_bytes(rq);
+		ofs = blk_rq_pos(rq) * 512ULL;
+		rq_bio = rq->bio;
+		if (do_write && rbd_dev->read_only) {
+			__blk_end_request_all(rq, -EROFS);
+			goto next;
+		}
+
+		spin_unlock_irq(q->queue_lock);
+
+		dout("%s 0x%x bytes at 0x%llx\n",
+		     do_write ? "write" : "read",
+		     size, blk_rq_pos(rq) * 512ULL);
+
+		do {
+			/* a bio clone to be passed down to OSD req */
+			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+			op_size = rbd_get_segment(&rbd_dev->header,
+						  rbd_dev->header.block_name,
+						  ofs, size,
+						  NULL, NULL);
+			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+					      op_size, GFP_ATOMIC);
+			if (!bio) {
+				spin_lock_irq(q->queue_lock);
+				__blk_end_request_all(rq, -ENOMEM);
+				goto next;
+			}
+
+			/* init OSD command: write or read */
+			if (do_write)
+				rbd_req_write(rq, rbd_dev,
+					      rbd_dev->header.snapc,
+					      ofs,
+					      op_size, bio);
+			else
+				rbd_req_read(rq, rbd_dev,
+					     cur_snap_id(rbd_dev),
+					     ofs,
+					     op_size, bio);
+
+			size -= op_size;
+			ofs += op_size;
+
+			rq_bio = next_bio;
+		} while (size > 0);
+
+		if (bp) {
+			bio_pair_release(bp);
+			/*
+			 * bio_chain_clone() releases a non-NULL *bp on entry;
+			 * never leave a stale pointer for the next request.
+			 */
+			bp = NULL;
+		}
+
+		spin_lock_irq(q->queue_lock);
+next:
+		rq = blk_fetch_request(q);
+	}
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects.
One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ *
+ * Returns how many bytes of @bvec may still be added to the bio described
+ * by @bmd without crossing the end of the current segment.  An empty bio
+ * always accepts one whole bvec.
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+			  struct bio_vec *bvec)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+	unsigned int bio_sectors = bmd->bi_size >> 9;
+	int max;
+
+	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
+				 + bio_sectors)) << 9;
+	if (max < 0)
+		max = 0; /* bio_add cannot handle a negative return */
+	if (max <= bvec->bv_len && bio_sectors == 0)
+		return bvec->bv_len;
+	return max;
+}
+
+/*
+ * Tear down the gendisk and its queue, then free the image header.
+ * Does nothing if the device never got a disk.
+ */
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk = rbd_dev->disk;
+
+	if (!disk)
+		return;
+
+	if (disk->flags & GENHD_FL_UP)
+		del_gendisk(disk);
+	if (disk->queue)
+		blk_cleanup_queue(disk->queue);
+	put_disk(disk);
+
+	/*
+	 * rbd_rq_fn() reads the header for every request, so it may only be
+	 * freed once the disk and queue are gone.
+	 */
+	rbd_header_free(&rbd_dev->header);
+}
+
+/*
+ * reload the on-disk header
+ *
+ * The snapshot count is not known up front: the first pass reads only the
+ * fixed part; if the header reports a different number of snapshots than
+ * was read, the buffer is re-sized and the read is repeated.
+ *
+ * Returns 0 with *header filled in, or a negative errno.
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+			   struct rbd_image_header *header)
+{
+	ssize_t rc;
+	struct rbd_image_header_ondisk *dh;
+	int snap_count = 0;
+	u64 snap_names_len = 0;
+
+	while (1) {
+		int len = sizeof(*dh) +
+			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
+			  snap_names_len;
+
+		rc = -ENOMEM;
+		dh = kmalloc(len, GFP_KERNEL);
+		if (!dh)
+			return -ENOMEM;
+
+		rc = rbd_req_sync_read(rbd_dev,
+				       NULL, CEPH_NOSNAP,
+				       rbd_dev->obj_md_name,
+				       0, len,
+				       (char *)dh);
+		if (rc < 0)
+			goto out_dh;
+
+		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+		if (rc < 0)
+			goto out_dh;
+
+		if (snap_count != header->total_snaps) {
+			/* sized wrongly: retry with the counts just learned */
+			snap_count = header->total_snaps;
+			snap_names_len = header->snap_names_len;
+			rbd_header_free(header);
+			kfree(dh);
+			continue;
+		}
+		break;
+	}
+
+out_dh:
+	kfree(dh);
+	return rc;
+}
+
+/*
+ * create a snapshot
+ *
+ * Obtains a new snapshot id from the monitor and records it under
+ * @snap_name by calling the "rbd" class method "snap_add" on the header
+ * object.  Only allowed while the device maps the head.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+			       const char *snap_name,
+			       gfp_t gfp_flags)
+{
+	int name_len = strlen(snap_name);
+	u64 new_snapid;
+	int ret;
+	void *data, *data_start, *data_end;
+
+	/* we should create a snapshot only if we're pointing at the head */
+	if (dev->cur_snap)
+		return -EINVAL;
+
+	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+				      &new_snapid);
+	if (ret < 0)
+		return ret;
+	/* only meaningful (and initialised) once the call succeeded */
+	dout("created snapid=%lld\n", new_snapid);
+
+	data = kmalloc(name_len + 16, gfp_flags);
+	if (!data)
+		return -ENOMEM;
+
+	data_start = data;
+	data_end = data + name_len + 16;
+
+	/* both encoders advance data and jump to "bad" if out of room */
+	ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+	ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+				data_start, data - data_start);
+
+	kfree(data_start);
+
+	if (ret < 0)
+		return ret;
+
+	dev->header.snapc->seq =  new_snapid;
+
+	return 0;
+bad:
+	kfree(data_start);
+	return -ERANGE;
+}
+
+/*
+ * Re-read the on-disk header and replace the device's snapshot context,
+ * snapshot names and snapshot sizes with the fresh ones, keeping the
+ * current snapc->seq.  The other header fields are left untouched.
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+	int ret;
+	struct rbd_image_header h;
+	u64 snap_seq;
+
+	ret = rbd_read_header(rbd_dev, &h);
+	if (ret < 0)
+		return ret;
+
+	down_write(&rbd_dev->header.snap_rwsem);
+
+	snap_seq = rbd_dev->header.snapc->seq;
+
+	kfree(rbd_dev->header.snapc);
+	kfree(rbd_dev->header.snap_names);
+	kfree(rbd_dev->header.snap_sizes);
+
+	rbd_dev->header.total_snaps = h.total_snaps;
+	rbd_dev->header.snapc = h.snapc;
+	rbd_dev->header.snap_names = h.snap_names;
+	rbd_dev->header.snap_sizes = h.snap_sizes;
+	rbd_dev->header.snapc->seq = snap_seq;
+
+	up_write(&rbd_dev->header.snap_rwsem);
+
+	return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int rc;
+	u64 total_size = 0;
+
+	/* contact OSD, request size info about the object being mapped */
+	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+	if (rc)
+		return rc;
+
+	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+	if (rc)
+		goto out;
+
+	/* create gendisk info */
+	rc = -ENOMEM;
+	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+	if (!disk)
+		goto out;
+
+	sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+	disk->major = rbd_dev->major;
+	disk->first_minor = 0;
+	disk->fops = &rbd_bd_ops;
+	disk->private_data = rbd_dev;
+
+	/* init rq */
+	rc = -ENOMEM;
+	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+	if (!q)
+		goto out_disk;
+	blk_queue_merge_bvec(q, rbd_merge_bvec);
+	disk->queue = q;
+
+	q->queuedata = rbd_dev;
+
+	rbd_dev->disk = disk;
+	rbd_dev->q = q;
+
+	/* finally, announce the disk to the world */
+	set_capacity(disk, total_size / 512ULL);
+	add_disk(disk);
+
+	pr_info("%s: added with size 0x%llx\n",
+		disk->disk_name, (unsigned long long)total_size);
+	return 0;
+
+out_disk:
+	put_disk(disk);
+out:
+	/*
+	 * The header was read successfully above and the caller frees the
+	 * device without it on failure, so release it here.
+	 */
+	rbd_header_free(&rbd_dev->header);
+	return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ *                   add	map rados objects to blkdev
+ *                   remove	unmap rados objects
+ *                   list	show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+	kfree(cls);
+}
+
+/*
+ * "list" attribute: one header line plus one line per mapped device.
+ * Output is limited to the PAGE_SIZE buffer sysfs hands us.
+ */
+static ssize_t class_rbd_list(struct class *c,
+			      struct class_attribute *attr,
+			      char *data)
+{
+	int n = 0;
+	struct list_head *tmp;
+	int max = PAGE_SIZE;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	/*
+	 * scnprintf() returns what was actually stored, so n can never run
+	 * past the buffer; snprintf() returns the untruncated length, which
+	 * would make "max - n" negative on the following call.
+	 */
+	n += scnprintf(data, max,
+		       "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+	list_for_each(tmp, &rbd_dev_list) {
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		n += scnprintf(data+n, max-n,
+			       "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+			       rbd_dev->id,
+			       rbd_dev->major,
+			       ceph_client_id(rbd_dev->client),
+			       rbd_dev->pool_name,
+			       rbd_dev->obj, rbd_dev->snap_name,
+			       rbd_dev->header.image_size >> 10);
+		/* buffer full (one byte is kept for the terminator) */
+		if (n >= max - 1)
+			break;
+	}
+
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+
+/*
+ * "add" attribute: parse "<mon addr> <options> <pool> <image> [snap]",
+ * create the rbd_device, attach a ceph client, look up the pool, register
+ * a block major and bring up the disk.  Holds a module reference for as
+ * long as the mapping exists.
+ *
+ * Returns @count on success or a negative errno.
+ *
+ * NOTE(review): mon_dev_name and options are not freed on the success
+ * path.  Whether they may be freed depends on whether the parsed ceph
+ * options keep pointers into them -- confirm, then free them there too.
+ * NOTE(review): the sscanf() field widths for pool_name, obj and
+ * snap_name equal the RBD_MAX_*_LEN constants, so those arrays in struct
+ * rbd_device must be at least one byte larger than that.
+ */
+static ssize_t class_rbd_add(struct class *c,
+			     struct class_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct ceph_osd_client *osdc;
+	struct rbd_device *rbd_dev;
+	ssize_t rc = -ENOMEM;
+	int irc, new_id = 0;
+	struct list_head *tmp;
+	char *mon_dev_name;
+	char *options;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	/* "%<N>s" stores up to N characters plus a NUL: allocate N + 1 */
+	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN + 1, GFP_KERNEL);
+	if (!mon_dev_name)
+		goto err_out_mod;
+
+	options = kmalloc(RBD_MAX_OPT_LEN + 1, GFP_KERNEL);
+	if (!options)
+		goto err_mon_dev;
+
+	/* new rbd_device object */
+	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		goto err_out_opt;
+
+	/* static rbd_device initialization */
+	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->node);
+
+	/* generate unique id: find highest unique id, add one */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &rbd_dev_list) {
+		struct rbd_device *other;
+
+		other = list_entry(tmp, struct rbd_device, node);
+		if (other->id >= new_id)
+			new_id = other->id + 1;
+	}
+
+	rbd_dev->id = new_id;
+
+	/* add to global list */
+	list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+	/* parse add command */
+	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
+		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+		   mon_dev_name, options, rbd_dev->pool_name,
+		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
+		rc = -EINVAL;
+		goto err_out_slot;
+	}
+
+	/* no snapshot given: map the head */
+	if (rbd_dev->snap_name[0] == 0)
+		rbd_dev->snap_name[0] = '-';
+
+	rbd_dev->obj_len = strlen(rbd_dev->obj);
+	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+		 rbd_dev->obj, RBD_SUFFIX);
+
+	/* initialize rest of new object */
+	snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+	if (rc < 0)
+		goto err_out_slot;
+
+	mutex_unlock(&ctl_mutex);
+
+	/* pick the pool */
+	osdc = &rbd_dev->client->osdc;
+	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+	if (rc < 0)
+		goto err_out_client;
+	rbd_dev->poolid = rc;
+
+	/* register our block device */
+	irc = register_blkdev(0, rbd_dev->name);
+	if (irc < 0) {
+		rc = irc;
+		goto err_out_client;
+	}
+	rbd_dev->major = irc;
+
+	/* set up and announce blkdev mapping */
+	rc = rbd_init_disk(rbd_dev);
+	if (rc)
+		goto err_out_blkdev;
+
+	return count;
+
+err_out_blkdev:
+	unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+	rbd_put_client(rbd_dev);
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+	list_del_init(&rbd_dev->node);
+	mutex_unlock(&ctl_mutex);
+
+	kfree(rbd_dev);
+err_out_opt:
+	kfree(options);
+err_mon_dev:
+	kfree(mon_dev_name);
+err_out_mod:
+	dout("Error adding device %s\n", buf);
+	module_put(THIS_MODULE);
+	return rc;
+}
+
+/*
+ * Look up a device by id in rbd_dev_list; the caller holds ctl_mutex.
+ * Returns NULL when no device has that id.
+ */
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+	struct list_head *tmp;
+	struct rbd_device *rbd_dev;
+
+	list_for_each(tmp, &rbd_dev_list) {
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		if (rbd_dev->id == id)
+			return rbd_dev;
+	}
+	return NULL;
+}
+
+/*
+ * "remove" attribute: unmap the device whose id is written to the file.
+ * Returns @count, -ENOENT for an unknown id, or the parse error.
+ */
+static ssize_t class_rbd_remove(struct class *c,
+				struct class_attribute *attr,
+				const char *buf,
+				size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	int target_id, rc;
+	unsigned long ul;
+
+	rc = strict_strtoul(buf, 10, &ul);
+	if (rc)
+		return rc;
+
+	/* convert to int; abort if we lost anything in the conversion */
+	target_id = (int) ul;
+	if (target_id != ul)
+		return -EINVAL;
+
+	/* remove object from list immediately */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbd_dev = __rbd_get_dev(target_id);
+	if (rbd_dev)
+		list_del_init(&rbd_dev->node);
+
+	mutex_unlock(&ctl_mutex);
+
+	if (!rbd_dev)
+		return -ENOENT;
+
+	/*
+	 * clean up and free blkdev first: as long as the disk is up,
+	 * rbd_rq_fn() may still submit requests through rbd_dev->client,
+	 * which rbd_put_client() sets to NULL.
+	 */
+	rbd_free_disk(rbd_dev);
+	unregister_blkdev(rbd_dev->major, rbd_dev->name);
+
+	rbd_put_client(rbd_dev);
+	kfree(rbd_dev);
+
+	/* release module ref */
+	module_put(THIS_MODULE);
+
+	return count;
+}
+
+static ssize_t
class_rbd_snaps_list(struct class *c, + struct class_attribute *attr, + char *data) +{ + struct rbd_device *rbd_dev = NULL; + struct list_head *tmp; + struct rbd_image_header *header; + int i, n = 0, max = PAGE_SIZE; + int ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + n += snprintf(data, max, "#id\tsnap\tKB\n"); + + list_for_each(tmp, &rbd_dev_list) { + char *names, *p; + struct ceph_snap_context *snapc; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + header = &rbd_dev->header; + + down_read(&header->snap_rwsem); + + names = header->snap_names; + snapc = header->snapc; + + n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n", + rbd_dev->id, RBD_SNAP_HEAD_NAME, + header->image_size >> 10, + (!rbd_dev->cur_snap ? " (*)" : "")); + if (n == max) + break; + + p = names; + for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { + n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n", + rbd_dev->id, p, header->snap_sizes[i] >> 10, + (rbd_dev->cur_snap && + (snap_index(header, i) == rbd_dev->cur_snap) ? 
+ " (*)" : "")); + if (n == max) + break; + } + + up_read(&header->snap_rwsem); + } + + + ret = n; + mutex_unlock(&ctl_mutex); + return ret; +} + +static ssize_t class_rbd_snaps_refresh(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, rc; + unsigned long ul; + int ret = count; + + rc = strict_strtoul(buf, 10, &ul); + if (rc) + return rc; + + /* convert to int; abort if we lost anything in the conversion */ + target_id = (int) ul; + if (target_id != ul) + return -EINVAL; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done; + } + + rc = rbd_update_snaps(rbd_dev); + if (rc < 0) + ret = rc; + +done: + mutex_unlock(&ctl_mutex); + return ret; +} + +static ssize_t class_rbd_snap_create(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, ret; + char *name; + + name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* parse snaps add command */ + if (sscanf(buf, "%d " + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + &target_id, + name) != 2) { + ret = -EINVAL; + goto done; + } + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done_unlock; + } + + ret = rbd_header_add_snap(rbd_dev, + name, GFP_KERNEL); + if (ret < 0) + goto done_unlock; + + ret = rbd_update_snaps(rbd_dev); + if (ret < 0) + goto done_unlock; + + ret = count; +done_unlock: + mutex_unlock(&ctl_mutex); +done: + kfree(name); + return ret; +} + +static ssize_t class_rbd_rollback(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, ret; + u64 snapid; + char snap_name[RBD_MAX_SNAP_NAME_LEN]; + u64 cur_ofs; + char *seg_name; + + /* parse 
snap rollback command: "<id> <snap name>" */
+	/*
+	 * snap_name must have room for RBD_MAX_SNAP_NAME_LEN characters
+	 * plus the terminating NUL that "%Ns" appends.
+	 */
+	if (sscanf(buf, "%d "
+		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+		   &target_id,
+		   snap_name) != 2) {
+		return -EINVAL;
+	}
+
+	/* scratch buffer reused for every segment (object) name below */
+	ret = -ENOMEM;
+	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+	if (!seg_name)
+		return ret;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbd_dev = __rbd_get_dev(target_id);
+	if (!rbd_dev) {
+		ret = -ENOENT;
+		goto done_unlock;
+	}
+
+	/* translate the snapshot name into its id */
+	ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+	if (ret < 0)
+		goto done_unlock;
+
+	dout("snapid=%lld\n", snapid);
+
+	/*
+	 * Walk the image from offset 0 to image_size.  The value returned
+	 * by rbd_get_segment() advances cur_ofs; seg_name is presumably
+	 * filled in with the object name for that offset (it is what gets
+	 * rolled back next) -- TODO confirm against rbd_get_segment().
+	 */
+	cur_ofs = 0;
+	while (cur_ofs < rbd_dev->header.image_size) {
+		cur_ofs += rbd_get_segment(&rbd_dev->header,
+					   rbd_dev->obj,
+					   cur_ofs, (u64)-1,
+					   seg_name, NULL);
+		dout("seg_name=%s\n", seg_name);
+
+		/*
+		 * A failed per-object rollback is only logged: the loop
+		 * carries on and ret is overwritten below, so it does not
+		 * affect the value returned to the writer.
+		 */
+		ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+		if (ret < 0)
+			pr_warning("could not roll back obj %s err=%d\n",
+				   seg_name, ret);
+	}
+
+	ret = rbd_update_snaps(rbd_dev);
+	if (ret < 0)
+		goto done_unlock;
+
+	ret = count;
+
+done_unlock:
+	mutex_unlock(&ctl_mutex);
+	kfree(seg_name);
+
+	return ret;
+}
+
+/*
+ * Control files of the rbd class.  Mode 0200 entries are write-only and
+ * provide only a store handler; mode 0444 entries are read-only and
+ * provide only a show handler.
+ */
+static struct class_attribute class_rbd_attrs[] = {
+	__ATTR(add,		0200, NULL, class_rbd_add),
+	__ATTR(remove,		0200, NULL, class_rbd_remove),
+	__ATTR(list,		0444, class_rbd_list, NULL),
+	__ATTR(snaps_refresh,	0200, NULL, class_rbd_snaps_refresh),
+	__ATTR(snap_create,	0200, NULL, class_rbd_snap_create),
+	__ATTR(snaps_list,	0444, class_rbd_snaps_list, NULL),
+	__ATTR(snap_rollback,	0200, NULL, class_rbd_rollback),
+	__ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */ +static int rbd_sysfs_init(void) +{ + int ret = -ENOMEM; + + class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL); + if (!class_rbd) + goto out; + + class_rbd->name = DRV_NAME; + class_rbd->owner = THIS_MODULE; + class_rbd->class_release = class_rbd_release; + class_rbd->class_attrs = class_rbd_attrs; + + ret = class_register(class_rbd); + if (ret) + goto out_class; + return 0; + +out_class: + kfree(class_rbd); + class_rbd = NULL; + pr_err(DRV_NAME ": failed to create class rbd\n"); +out: + return ret; +} + +static void rbd_sysfs_cleanup(void) +{ + if (class_rbd) + class_destroy(class_rbd); + class_rbd = NULL; +} + +int __init rbd_init(void) +{ + int rc; + + rc = rbd_sysfs_init(); + if (rc) + return rc; + spin_lock_init(&node_lock); + pr_info("loaded " DRV_NAME_LONG "\n"); + return 0; +} + +void __exit rbd_exit(void) +{ + rbd_sysfs_cleanup(); +} + +module_init(rbd_init); +module_exit(rbd_exit); + +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +MODULE_DESCRIPTION("rados block device"); + +/* following authorship retained from original osdblk.c */ +MODULE_AUTHOR("Jeff Garzik "); + +MODULE_LICENSE("GPL"); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h new file mode 100644 index 000000000000..fc6c678aa2cb --- /dev/null +++ b/drivers/block/rbd_types.h @@ -0,0 +1,73 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RBD_TYPES_H +#define CEPH_RBD_TYPES_H + +#include + +/* + * rbd image 'foo' consists of objects + * foo.rbd - image metadata + * foo.00000000 + * foo.00000001 + * ... 
- data
+ */
+
+/* suffix appended to the image name to form the metadata object name */
+#define RBD_SUFFIX		".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+/* object "order": the 4MB note on the default suggests log2(bytes), 2^22 */
+#define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+/* name length limits; users add one byte for the terminating NUL */
+#define RBD_MAX_OBJ_NAME_LEN	96
+#define RBD_MAX_SEG_NAME_LEN	128
+
+#define RBD_COMP_NONE		0
+#define RBD_CRYPT_NONE		0
+
+#define RBD_HEADER_TEXT		"<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE	"RBD"
+#define RBD_HEADER_VERSION	"001.005"
+
+/*
+ * The structures below are packed and carry their multi-byte integers
+ * as explicit little-endian types (__le64/__le32).
+ */
+struct rbd_info {
+	__le64 max_id;
+} __attribute__ ((packed));
+
+/* one snapshot record: snapshot id and the image size recorded for it */
+struct rbd_image_snap_ondisk {
+	__le64 id;
+	__le64 image_size;
+} __attribute__((packed));
+
+/*
+ * Image header.  NOTE(review): text/signature/version presumably hold
+ * the RBD_HEADER_* strings above, and snaps[] presumably carries
+ * snap_count trailing records -- confirm against the header reader.
+ */
+struct rbd_image_header_ondisk {
+	char text[40];
+	char block_name[24];
+	char signature[4];
+	char version[8];
+	struct {
+		__u8 order;
+		__u8 crypt_type;
+		__u8 comp_type;
+		__u8 unused;
+	} __attribute__((packed)) options;
+	__le64 image_size;
+	__le64 snap_seq;
+	__le32 snap_count;
+	__le32 reserved;
+	__le64 snap_names_len;
+	/* variable-length trailer of snapshot records */
+	struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 0fcd2640c23f..9eb134ea6eb2 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,9 +1,11 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL + select CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO + default n help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely @@ -14,15 +16,3 @@ config CEPH_FS If unsure, say N. -config CEPH_FS_PRETTYDEBUG - bool "Include file:line in ceph debug output" - depends on CEPH_FS - default n - help - If you say Y here, debug output will include a filename and - line to aid debugging. This icnreases kernel size and slows - execution slightly when debug call sites are enabled (e.g., - via CONFIG_DYNAMIC_DEBUG). - - If unsure, say N. 
- diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 278e1172600d..9e6c4f2e8ff1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ - messenger.o msgpool.o buffer.o pagelist.o \ - mds_client.o mdsmap.o \ - mon_client.o \ - osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ - debugfs.o \ - auth.o auth_none.o \ - crypto.o armor.o \ - auth_x.o \ - ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o else #Otherwise we were called directly from the command diff --git a/fs/ceph/README b/fs/ceph/README deleted file mode 100644 index 18352fab37c0..000000000000 --- a/fs/ceph/README +++ /dev/null @@ -1,20 +0,0 @@ -# -# The following files are shared by (and manually synchronized -# between) the Ceph userland and kernel client. -# -# userland kernel -src/include/ceph_fs.h fs/ceph/ceph_fs.h -src/include/ceph_fs.cc fs/ceph/ceph_fs.c -src/include/msgr.h fs/ceph/msgr.h -src/include/rados.h fs/ceph/rados.h -src/include/ceph_strings.cc fs/ceph/ceph_strings.c -src/include/ceph_frag.h fs/ceph/ceph_frag.h -src/include/ceph_frag.cc fs/ceph/ceph_frag.c -src/include/ceph_hash.h fs/ceph/ceph_hash.h -src/include/ceph_hash.cc fs/ceph/ceph_hash.c -src/crush/crush.c fs/ceph/crush/crush.c -src/crush/crush.h fs/ceph/crush/crush.h -src/crush/mapper.c fs/ceph/crush/mapper.c -src/crush/mapper.h fs/ceph/crush/mapper.h -src/crush/hash.h fs/ceph/crush/hash.h -src/crush/hash.c fs/ceph/crush/hash.c diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index efbc604001c8..51bcc5ce3230 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include #include #include @@ -10,7 +10,8 @@ #include #include "super.h" -#include "osd_client.h" +#include "mds_client.h" +#include /* * Ceph address space ops. 
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page) { struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; @@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int rc = 0; struct page **pages; loff_t offset; @@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_client *client; + struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; loff_t page_off = page->index << PAGE_CACHE_SHIFT; int len = PAGE_CACHE_SIZE; @@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } inode = page->mapping->host; ci = ceph_inode(inode); - client = ceph_inode_to_client(inode); - osdc = &client->osdc; + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = (void *)page->private; @@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); - writeback_stat = atomic_long_inc_return(&client->writeback_count); + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > - CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + 
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct address_space *mapping = inode->i_mapping; __s32 rc = -EIO; u64 bytes = 0; - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); @@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req, WARN_ON(!PageUptodate(page)); writeback_stat = - atomic_long_dec_return(&client->writeback_count); + atomic_long_dec_return(&fsc->writeback_count); if (writeback_stat < - CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) - clear_bdi_congested(&client->backing_dev_info, + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); ceph_put_snap_context((void *)page->private); @@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req, * mempool. we avoid the mempool if we can because req->r_num_pages * may be less than the maximum write size. 
*/ -static void alloc_page_vec(struct ceph_client *client, +static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, GFP_NOFS); if (!req->r_pages) { - req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); + req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); req->r_pages_from_pool = 1; WARN_ON(!req->r_pages); } @@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct inode *inode = mapping->host; struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client; + struct ceph_fs_client *fsc; pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - client = ceph_inode_to_client(inode); - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { + fsc = ceph_inode_to_client(inode); + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! 
*/ } - if (client->mount_args->wsize && client->mount_args->wsize < wsize) - wsize = client->mount_args->wsize; + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; if (wsize < PAGE_CACHE_SIZE) wsize = PAGE_CACHE_SIZE; max_pages_ever = wsize >> PAGE_CACHE_SHIFT; @@ -769,7 +772,7 @@ get_more_pages: offset = (unsigned long long)page->index << PAGE_CACHE_SHIFT; len = wsize; - req = ceph_osdc_new_request(&client->osdc, + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, &len, @@ -782,7 +785,7 @@ get_more_pages: &inode->i_mtime, true, 1); max_pages = req->r_num_pages; - alloc_page_vec(client, req); + alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; } @@ -794,10 +797,10 @@ get_more_pages: inode, page, page->index); writeback_stat = - atomic_long_inc_return(&client->writeback_count); + atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > CONGESTION_ON_THRESH( - client->mount_args->congestion_kb)) { - set_bdi_congested(&client->backing_dev_info, + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); } @@ -846,7 +849,7 @@ get_more_pages: op->payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); - ceph_osdc_start_request(&client->osdc, req, true); + ceph_osdc_start_request(&fsc->client->osdc, req, true); req = NULL; /* continue? 
*/ @@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = vmf->page; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t off = page->index << PAGE_CACHE_SHIFT; loff_t size, len; int ret; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5e9da996a151..98ab13e2b71d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include #include #include @@ -9,8 +9,9 @@ #include #include "super.h" -#include "decode.h" -#include "messenger.h" +#include "mds_client.h" +#include +#include /* * Capability management @@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) spin_unlock(&mdsc->caps_list_lock); } -void ceph_reservation_status(struct ceph_client *client, +void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { - struct ceph_mds_client *mdsc = 
&client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; if (total) *total = mdsc->caps_total_count; @@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_args *ma = mdsc->client->mount_args; + struct ceph_mount_options *ma = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + ma->caps_wanted_delay_min * HZ); @@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap_reservation *caps_reservation) { - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; @@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci, int mds; struct ceph_cap_snap *capsnap; u32 mseq; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; /* if session != NULL, we hold session->s_mutex */ u64 next_follows = 0; /* keep track of how far we've gotten through the @@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty 
= 0; @@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode, /* * try to invalidate mapping pages without blocking. */ -static int mapping_is_empty(struct address_space *mapping) -{ - struct page *page = find_get_page(mapping, 0); - - if (!page) - return 1; - - put_page(page); - return 0; -} - static int try_nonblocking_invalidate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&inode->i_lock); - if (mapping_is_empty(&inode->i_data) && + if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); @@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; int file_wanted, used; @@ -1533,7 +1523,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && ci->i_wrbuffer_ref == 0 && /* no dirty pages... 
*/ - ci->i_rdcache_gen && /* may have cached pages */ + inode->i_data.nrpages && /* have cached pages */ (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ @@ -1706,7 +1696,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 0 : 1; int flushing = 0; @@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; @@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap; struct ceph_mds_caps *h; diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index ab6cf35c4091..bdce8b1fbd06 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c @@ -1,7 +1,8 @@ /* * Ceph 'frag' type */ -#include "types.h" +#include +#include int ceph_frag_compare(__u32 a, __u32 b) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6fd8b20a8611..7ae1b3d55b58 100644 --- 
a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include #include #include @@ -7,143 +7,49 @@ #include #include +#include +#include +#include +#include + #include "super.h" -#include "mds_client.h" -#include "mon_client.h" -#include "auth.h" #ifdef CONFIG_DEBUG_FS -/* - * Implement /sys/kernel/debug/ceph fun - * - * /sys/kernel/debug/ceph/client* - an instance of the ceph client - * .../osdmap - current osdmap - * .../mdsmap - current mdsmap - * .../monmap - current monmap - * .../osdc - active osd requests - * .../mdsc - active mds requests - * .../monc - mon client state - * .../dentry_lru - dump contents of dentry lru - * .../caps - expose cap (reservation) stats - * .../bdi - symlink to ../../bdi/something - */ - -static struct dentry *ceph_debugfs_dir; - -static int monmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - - if (client->monc.monmap == NULL) - return 0; - - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); - for (i = 0; i < client->monc.monmap->num_mon; i++) { - struct ceph_entity_inst *inst = - &client->monc.monmap->mon_inst[i]; - - seq_printf(s, "\t%s%lld\t%s\n", - ENTITY_NAME(inst->name), - pr_addr(&inst->addr.in_addr)); - } - return 0; -} +#include "mds_client.h" static int mdsmap_show(struct seq_file *s, void *p) { int i; - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; - if (client->mdsc.mdsmap == NULL) + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); - seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); seq_printf(s, "session_timeout %d\n", - client->mdsc.mdsmap->m_session_timeout); + fsc->mdsc->mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", - client->mdsc.mdsmap->m_session_autoclose); - for (i = 0; i < 
client->mdsc.mdsmap->m_max_mds; i++) { + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { struct ceph_entity_addr *addr = - &client->mdsc.mdsmap->m_info[i].addr; - int state = client->mdsc.mdsmap->m_info[i].state; + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; - seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); } return 0; } -static int osdmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - struct rb_node *n; - - if (client->osdc.osdmap == NULL) - return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); - seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = - rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - pool->id, pool->v.pg_num, pool->pg_num_mask, - pool->v.lpg_num, pool->lpg_num_mask); - } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; - char sb[64]; - - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", - i, pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); - } - return 0; -} - -static int monc_show(struct seq_file *s, void *p) -{ - struct ceph_client *client = s->private; - struct ceph_mon_generic_request *req; - struct ceph_mon_client *monc = &client->monc; - struct rb_node *rp; - - mutex_lock(&monc->mutex); - - if (monc->have_mdsmap) - seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); - if 
(monc->have_osdmap) - seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); - if (monc->want_next_osdmap) - seq_printf(s, "want next osdmap\n"); - - for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { - __u16 op; - req = rb_entry(rp, struct ceph_mon_generic_request, node); - op = le16_to_cpu(req->request->hdr.type); - if (op == CEPH_MSG_STATFS) - seq_printf(s, "%lld statfs\n", req->tid); - else - seq_printf(s, "%lld unknown\n", req->tid); - } - - mutex_unlock(&monc->mutex); - return 0; -} - +/* + * mdsc debugfs + */ static int mdsc_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; int pathlen; @@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } -static int osdc_show(struct seq_file *s, void *pp) -{ - struct ceph_client *client = s->private; - struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - int num_ops; - int opcode, olen; - int i; - - req = rb_entry(p, struct ceph_osd_request, r_node); - - seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, - req->r_osd ? 
req->r_osd->o_osd : -1, - le32_to_cpu(req->r_pgid.pool), - le16_to_cpu(req->r_pgid.ps)); - - head = req->r_request->front.iov_base; - op = (void *)(head + 1); - - num_ops = le16_to_cpu(head->num_ops); - olen = le32_to_cpu(head->object_len); - seq_printf(s, "%.*s", olen, - (const char *)(head->ops + num_ops)); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < num_ops; i++) { - opcode = le16_to_cpu(op->op); - seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); - op++; - } - - seq_printf(s, "\n"); - } - mutex_unlock(&osdc->request_mutex); - return 0; -} - static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; int total, avail, used, reserved, min; - ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" @@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p) static int dentry_lru_show(struct seq_file *s, void *ptr) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_dentry_info *di; spin_lock(&mdsc->dentry_lru_lock); @@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } -#define DEFINE_SHOW_FUNC(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ -} \ - \ -static const struct file_operations name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = 
single_release, \ -}; +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) -DEFINE_SHOW_FUNC(monmap_show) -DEFINE_SHOW_FUNC(mdsmap_show) -DEFINE_SHOW_FUNC(osdmap_show) -DEFINE_SHOW_FUNC(monc_show) -DEFINE_SHOW_FUNC(mdsc_show) -DEFINE_SHOW_FUNC(osdc_show) -DEFINE_SHOW_FUNC(dentry_lru_show) -DEFINE_SHOW_FUNC(caps_show) +/* + * debugfs + */ static int congestion_kb_set(void *data, u64 val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - client->mount_args->congestion_kb = (int)val; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + fsc->mount_options->congestion_kb = (int)val; return 0; } static int congestion_kb_get(void *data, u64 *val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - *val = (u64)client->mount_args->congestion_kb; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + *val = (u64)fsc->mount_options->congestion_kb; return 0; } - DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, congestion_kb_set, "%llu\n"); -int __init ceph_debugfs_init(void) + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { - ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); - if (!ceph_debugfs_dir) - return -ENOMEM; - return 0; + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); } -void ceph_debugfs_cleanup(void) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - debugfs_remove(ceph_debugfs_dir); -} + char name[100]; + int err = -ENOMEM; -int ceph_debugfs_client_init(struct ceph_client *client) -{ - int ret = 0; - char name[80]; - - snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, - client->monc.auth->global_id); - - client->debugfs_dir = 
debugfs_create_dir(name, ceph_debugfs_dir); - if (!client->debugfs_dir) - goto out; - - client->monc.debugfs_file = debugfs_create_file("monc", - 0600, - client->debugfs_dir, - client, - &monc_show_fops); - if (!client->monc.debugfs_file) - goto out; - - client->mdsc.debugfs_file = debugfs_create_file("mdsc", - 0600, - client->debugfs_dir, - client, - &mdsc_show_fops); - if (!client->mdsc.debugfs_file) - goto out; - - client->osdc.debugfs_file = debugfs_create_file("osdc", - 0600, - client->debugfs_dir, - client, - &osdc_show_fops); - if (!client->osdc.debugfs_file) - goto out; - - client->debugfs_monmap = debugfs_create_file("monmap", - 0600, - client->debugfs_dir, - client, - &monmap_show_fops); - if (!client->debugfs_monmap) - goto out; - - client->debugfs_mdsmap = debugfs_create_file("mdsmap", - 0600, - client->debugfs_dir, - client, - &mdsmap_show_fops); - if (!client->debugfs_mdsmap) - goto out; - - client->debugfs_osdmap = debugfs_create_file("osdmap", - 0600, - client->debugfs_dir, - client, - &osdmap_show_fops); - if (!client->debugfs_osdmap) - goto out; - - client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0600, - client->debugfs_dir, - client, - &dentry_lru_show_fops); - if (!client->debugfs_dentry_lru) - goto out; - - client->debugfs_caps = debugfs_create_file("caps", - 0400, - client->debugfs_dir, - client, - &caps_show_fops); - if (!client->debugfs_caps) - goto out; - - client->debugfs_congestion_kb = + dout("ceph_fs_debugfs_init\n"); + fsc->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 0600, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &congestion_kb_fops); - if (!client->debugfs_congestion_kb) + if (!fsc->debugfs_congestion_kb) goto out; - sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); - client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, - name); + dout("a\n"); + + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); 
+ fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) + goto out; + + dout("b\n"); + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsmap_show_fops); + if (!fsc->debugfs_mdsmap) + goto out; + + dout("ca\n"); + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) + goto out; + + dout("da\n"); + fsc->debugfs_caps = debugfs_create_file("caps", + 0400, + fsc->client->debugfs_dir, + fsc, + &caps_show_fops); + if (!fsc->debugfs_caps) + goto out; + + dout("ea\n"); + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) + goto out; return 0; out: - ceph_debugfs_client_cleanup(client); - return ret; + ceph_fs_debugfs_cleanup(fsc); + return err; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) -{ - debugfs_remove(client->debugfs_bdi); - debugfs_remove(client->debugfs_caps); - debugfs_remove(client->debugfs_dentry_lru); - debugfs_remove(client->debugfs_osdmap); - debugfs_remove(client->debugfs_mdsmap); - debugfs_remove(client->debugfs_monmap); - debugfs_remove(client->osdc.debugfs_file); - debugfs_remove(client->mdsc.debugfs_file); - debugfs_remove(client->monc.debugfs_file); - debugfs_remove(client->debugfs_congestion_kb); - debugfs_remove(client->debugfs_dir); -} #else /* CONFIG_DEBUG_FS */ -int __init ceph_debugfs_init(void) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { return 0; } -void ceph_debugfs_cleanup(void) -{ -} - -int ceph_debugfs_client_init(struct ceph_client *client) -{ - return 0; -} - -void ceph_debugfs_client_cleanup(struct ceph_client *client) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a1986eb52045..e0a2dc6fcafc 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1,4 
+1,4 @@ -#include "ceph_debug.h" +#include #include #include @@ -7,6 +7,7 @@ #include #include "super.h" +#include "mds_client.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p) */ static int __dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) - __releases(inode->i_lock) - __acquires(inode->i_lock) { - struct inode *inode = filp->f_dentry->d_inode; struct ceph_file_info *fi = filp->private_data; struct dentry *parent = filp->f_dentry; struct inode *dir = parent->d_inode; @@ -153,7 +151,6 @@ more: atomic_inc(&dentry->d_count); spin_unlock(&dcache_lock); - spin_unlock(&inode->i_lock); dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -171,35 +168,30 @@ more: } else { dput(last); } - last = NULL; } - - spin_lock(&inode->i_lock); - spin_lock(&dcache_lock); - last = dentry; if (err < 0) - goto out_unlock; + goto out; - p = p->prev; filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ - if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) - goto more; - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); - err = -EAGAIN; + if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { + dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + err = -EAGAIN; + goto out; + } + + spin_lock(&dcache_lock); + p = p->prev; /* advance to next dentry */ + goto more; out_unlock: spin_unlock(&dcache_lock); - - if (last) { - spin_unlock(&inode->i_lock); +out: + if (last) dput(last); - spin_lock(&inode->i_lock); - } - return err; } @@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) struct ceph_file_info *fi = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct 
ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned frag = fpos_frag(filp->f_pos); int off = fpos_off(filp->f_pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = client->mount_args->max_readdir; - const int max_bytes = client->mount_args->max_readdir_bytes; + const int max_entries = fsc->mount_options->max_readdir; + const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) /* can we use the dcache? */ spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && - !ceph_test_opt(client, NOASYNCREADDIR) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + spin_unlock(&inode->i_lock); err = __dcache_readdir(filp, dirent, filldir); - if (err != -EAGAIN) { - spin_unlock(&inode->i_lock); + if (err != -EAGAIN) return err; - } + } else { + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); if (fi->dentry) { err = note_last_dentry(fi, fi->dentry->d_name.name, fi->dentry->d_name.len); @@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? 
*/ if (err == -ENOENT && - ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ strcmp(dentry->d_name.name, - client->mount_args->snapdir_name) == 0) { + fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", dentry, dentry->d_name.len, dentry->d_name.name, inode); @@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int op; int err; @@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dir->i_lock); dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && @@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) static int ceph_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, static int ceph_symlink(struct inode *dir, struct dentry *dentry, const char *dest) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 
+ struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err = -EROFS; int op; @@ -758,8 +749,8 @@ out: static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode) */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; struct ceph_mds_request *req; int err = -EROFS; @@ -854,8 +845,8 @@ out: static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + if 
(!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e38423e82f2e..2297d9426992 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -1,10 +1,11 @@ -#include "ceph_debug.h" +#include #include #include #include #include "super.h" +#include "mds_client.h" /* * NFS export support @@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 66e4da6dba22..e77c28cf3690 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ -#include "ceph_debug.h" +#include +#include 
#include #include #include @@ -38,8 +39,8 @@ static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_client *client = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; @@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct file *file = nd->intent.open.file; struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; @@ -269,163 +270,6 @@ int ceph_release(struct inode *inode, struct file *file) return 0; } -/* - * build a vector of user pages - */ -static struct page **get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len) -{ - struct page **pages; - int rc; - - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); - - down_read(&current->mm->mmap_sem); - rc = get_user_pages(current, current->mm, 
(unsigned long)data, - num_pages, 0, 0, pages, NULL); - up_read(&current->mm->mmap_sem); - if (rc < 0) - goto fail; - return pages; - -fail: - kfree(pages); - return ERR_PTR(rc); -} - -static void put_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - kfree(pages); -} - -void ceph_release_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - __free_pages(pages[i], 0); - kfree(pages); -} - -/* - * allocate a vector new pages - */ -static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) -{ - struct page **pages; - int i; - - pages = kmalloc(sizeof(*pages) * num_pages, flags); - if (!pages) - return ERR_PTR(-ENOMEM); - for (i = 0; i < num_pages; i++) { - pages[i] = __page_cache_alloc(flags); - if (pages[i] == NULL) { - ceph_release_page_vector(pages, i); - return ERR_PTR(-ENOMEM); - } - } - return pages; -} - -/* - * copy user data into a page vector - */ -static int copy_user_to_page_vector(struct page **pages, - const char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, PAGE_CACHE_SIZE-po, left); - bad = copy_from_user(page_address(pages[i]) + po, data, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - po += l - bad; - if (po == PAGE_CACHE_SIZE) { - po = 0; - i++; - } - } - return len; -} - -/* - * copy user data from a page vector into a user pointer - */ -static int copy_page_vector_to_user(struct page **pages, char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, page_address(pages[i]) + po, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } 
- i++; - } - return len; -} - -/* - * Zero an extent within a page vector. Offset is relative to the - * start of the first page. - */ -static void zero_page_vector_range(int off, int len, struct page **pages) -{ - int i = off >> PAGE_CACHE_SHIFT; - - off &= ~PAGE_CACHE_MASK; - - dout("zero_page_vector_page %u~%u\n", off, len); - - /* leading partial page? */ - if (off) { - int end = min((int)PAGE_CACHE_SIZE, off + len); - dout("zeroing %d %p head from %d\n", i, pages[i], - (int)off); - zero_user_segment(pages[i], off, end); - len -= (end - off); - i++; - } - while (len >= PAGE_CACHE_SIZE) { - dout("zeroing %d %p len=%d\n", i, pages[i], len); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - i++; - } - /* trailing partial page? */ - if (len) { - dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); - zero_user_segment(pages[i], 0, len); - } -} - - /* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) 
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode, struct page **pages, int num_pages, int *checkeof) { - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ @@ -459,7 +303,7 @@ static int striped_read(struct inode *inode, more: this_len = left; - ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, @@ -477,8 +321,8 @@ more: if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); - zero_page_vector_range(page_off + read, - pos - off - read, pages); + ceph_zero_page_vector_range(page_off + read, + pos - off - read, pages); } pos += ret; read = pos - off; @@ -495,8 +339,8 @@ more: /* was original extent fully inside i_size? */ if (pos + left <= inode->i_size) { dout("zero tail\n"); - zero_page_vector_range(page_off + read, len - read, - pages); + ceph_zero_page_vector_range(page_off + read, len - read, + pages); read = len; goto out; } @@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, off, len); + pages = ceph_get_direct_page_vector(data, num_pages, off, len); /* * flush any page cache pages in this range. 
this @@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, ret = striped_read(inode, off, len, pages, num_pages, checkeof); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = copy_page_vector_to_user(pages, data, off, ret); + ret = ceph_copy_page_vector_to_user(pages, data, off, ret); if (ret >= 0) *poff = off + ret; done: if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; struct page **pages; int num_pages; @@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; - req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, @@ -655,7 +499,7 @@ more: num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages, pos, len); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -673,7 +517,7 @@ more: ret = PTR_ERR(pages); goto out; } - ret = copy_user_to_page_vector(pages, data, pos, len); + ret = ceph_copy_user_to_page_vector(pages, data, pos, len); if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; @@ -689,7 +533,7 @@ more: req->r_num_pages = num_pages; req->r_inode = inode; - ret = ceph_osdc_start_request(&client->osdc, req, false); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if 
(!ret) { if (req->r_safe_callback) { /* @@ -701,11 +545,11 @@ more: spin_unlock(&ci->i_unsafe_lock); ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); } - ret = ceph_osdc_wait_request(&client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); } if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); @@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; loff_t endoff = pos + iov->iov_len; int want, got = 0; int ret, err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 62377ec37edf..1d6a45b5a04c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include #include #include @@ -13,7 +13,8 @@ #include #include "super.h" -#include "decode.h" +#include "mds_client.h" +#include /* * Ceph inode operations @@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode, } /* it may be better to set st_size in getattr instead? 
*/ - if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *in = NULL; struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int i = 0; int err = 0; @@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, */ if (rinfo->head->is_dentry && !req->r_aborted && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { /* * lookup link rename : null -> possibly existing inode @@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; @@ -1728,8 +1729,8 @@ out: */ int ceph_do_getattr(struct inode *inode, int mask) { - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 76e307d2aba1..8888c9ba68db 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,8 +1,10 @@ #include -#include "ioctl.h" #include "super.h" -#include "ceph_debug.h" +#include "mds_client.h" +#include + +#include "ioctl.h" /* @@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode 
*inode = file->f_dentry->d_inode; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; int err, i; @@ -89,6 +91,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return err; } +/* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. + */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + 
req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + /* * Return object name, size/offset information, and location (OSD * number, network address) for a given file offset. @@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; @@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SET_LAYOUT: return ceph_ioctl_set_layout(file, (void __user *)arg); + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + case CEPH_IOC_GET_DATALOC: return ceph_ioctl_get_dataloc(file, (void __user *)arg); case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); } + return -ENOTTY; } diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 88451a3b6857..a6ce54e94eb5 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,7 +4,7 @@ #include #include -#define CEPH_IOCTL_MAGIC 0x97 +#define CEPH_IOCTL_MAGIC 0x98 /* just use u64 to align sanely on all archs */ struct ceph_ioctl_layout { @@ -17,6 +17,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) /* * Extract identity, address of the OSD and object storing a given diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ff4e753aae92..40abde93c345 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -1,11 +1,11 @@ -#include "ceph_debug.h" +#include #include #include #include 
"super.h" #include "mds_client.h" -#include "pagelist.h" +#include /** * Implement fcntl and flock locking functions. @@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, { struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; @@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) * Encode the flock and fcntl locks for the given inode into the pagelist. * Format is: #fcntl locks, sequential fcntl locks, #flock locks, * sequential flock locks. - * Must be called with BLK already held, and the lock numbers should have - * been gathered under the same lock holding window. + * Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, + * we return -ENOSPC. */ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks) @@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, struct file_lock *lock; struct ceph_filelock cephlock; int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); @@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } err = lock_to_ceph_filelock(lock, &cephlock); if (err) goto fail; @@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto
fail; + } err = lock_to_ceph_filelock(lock, &cephlock); if (err) goto fail; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fad95f8f2608..3142b15940c2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1,17 +1,21 @@ -#include "ceph_debug.h" +#include +#include #include #include #include +#include +#include #include -#include "mds_client.h" -#include "mon_client.h" #include "super.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" -#include "pagelist.h" +#include "mds_client.h" + +#include +#include +#include +#include +#include /* * A cluster of MDS (metadata server) daemons is responsible for @@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_authorizer) - s->s_mdsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->client->monc.auth, s->s_authorizer); + s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_authorizer); kfree(s); } } @@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->client->msgr, &s->s_con); + ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); s->s_con.private = s; s->s_con.ops = &mds_con_ops; s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; @@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (req->r_dentry) { struct inode *dir = req->r_dentry->d_parent->d_inode; - if (dir->i_sb != mdsc->client->sb) { + if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! 
*/ inode = req->r_dentry->d_inode; } else if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { @@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg, *partial = NULL; struct ceph_mds_cap_release *head; int err = -ENOMEM; - int extra = mdsc->client->mount_args->cap_release_safety; + int extra = mdsc->fsc->mount_options->cap_release_safety; int num; dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, @@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); - err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); @@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; + struct ceph_pagelist_cursor trunc_point; - lock_kernel(); - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); + ceph_pagelist_set_cursor(pagelist, &trunc_point); + do { + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, + &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + unlock_flocks(); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_encode_locks(inode, pagelist, - num_fcntl_locks, - num_flock_locks); - 
unlock_kernel(); + /* pre-alloc pagelist */ + ceph_pagelist_truncate(pagelist, &trunc_point); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_pagelist_reserve(pagelist, + rec.v2.flock_len); + + /* encode locks */ + if (!err) { + lock_flocks(); + err = ceph_encode_locks(inode, + pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + } + } while (err == -ENOSPC); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } @@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_inode_info *ci; struct dentry *parent, *dentry; @@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work) schedule_delayed(mdsc); } +int ceph_mdsc_init(struct ceph_fs_client *fsc) -int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) { - mdsc->client = client; + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (mdsc->mdsmap == NULL) @@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) INIT_LIST_HEAD(&mdsc->dentry_lru); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, client->min_caps); + ceph_adjust_min_caps(mdsc, fsc->min_caps); return 0; } @@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) static void wait_requests(struct ceph_mds_client *mdsc) { struct ceph_mds_request *req; - struct ceph_client *client = mdsc->client; + struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting 
for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - client->mount_args->mount_timeout * HZ); + fsc->client->options->mount_timeout * HZ); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; mutex_lock(&mdsc->mutex); @@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - struct ceph_client *client = mdsc->client; - unsigned long timeout = client->mount_args->mount_timeout * HZ; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("stopped\n"); } -void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ @@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_caps_finalize(mdsc); } +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + ceph_mdsc_stop(mdsc); + fsc->mdsc = NULL; + kfree(mdsc); +} + /* * handle mds map update. 
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_check_fsid(mdsc->client, &fsid) < 0) + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); dout("handle_map epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ - ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { dout("handle_map epoch %u <= our %u\n", @@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) } else { mdsc->mdsmap = newmap; /* first mds map */ } - mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; __wake_requests(mdsc, &mdsc->waiting_for_map); @@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con, { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; int ret = 0; if (force_new && s->s_authorizer) { @@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); } @@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 
if (ac->ops->invalidate_authorizer) ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); - return ceph_monc_validate_auth(&mdsc->client->monc); + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } static const struct ceph_connection_operations mds_con_ops = { @@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = { .peer_reset = peer_reset, }; - - - /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c98267ce6d2a..d66d63c72355 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -8,9 +8,9 @@ #include #include -#include "types.h" -#include "messenger.h" -#include "mdsmap.h" +#include +#include +#include /* * Some lock dependencies: @@ -26,7 +26,7 @@ * */ -struct ceph_client; +struct ceph_fs_client; struct ceph_cap; /* @@ -230,7 +230,7 @@ struct ceph_mds_request { * mds client state */ struct ceph_mds_client { - struct ceph_client *client; + struct ceph_fs_client *fsc; struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; @@ -289,11 +289,6 @@ struct ceph_mds_client { int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ - -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_file; -#endif - spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; @@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, struct ceph_msg *msg, int mds); -extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, - struct ceph_client *client); +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); -extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 040be6d1150b..73b7d44e8a35 100644 --- 
a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include #include #include @@ -6,9 +6,9 @@ #include #include -#include "mdsmap.h" -#include "messenger.h" -#include "decode.h" +#include +#include +#include #include "super.h" @@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) } dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", - i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); if (mds >= 0 && mds < m->m_max_mds && state > 0) { m->m_info[mds].global_id = global_id; diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c deleted file mode 100644 index 46a368b6dce5..000000000000 --- a/fs/ceph/pagelist.c +++ /dev/null @@ -1,63 +0,0 @@ - -#include -#include -#include - -#include "pagelist.h" - -static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) -{ - struct page *page = list_entry(pl->head.prev, struct page, - lru); - kunmap(page); -} - -int ceph_pagelist_release(struct ceph_pagelist *pl) -{ - if (pl->mapped_tail) - ceph_pagelist_unmap_tail(pl); - - while (!list_empty(&pl->head)) { - struct page *page = list_first_entry(&pl->head, struct page, - lru); - list_del(&page->lru); - __free_page(page); - } - return 0; -} - -static int ceph_pagelist_addpage(struct ceph_pagelist *pl) -{ - struct page *page = __page_cache_alloc(GFP_NOFS); - if (!page) - return -ENOMEM; - pl->room += PAGE_SIZE; - list_add_tail(&page->lru, &pl->head); - if (pl->mapped_tail) - ceph_pagelist_unmap_tail(pl); - pl->mapped_tail = kmap(page); - return 0; -} - -int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) -{ - while (pl->room < len) { - size_t bit = pl->room; - int ret; - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), - buf, bit); - pl->length += bit; - pl->room -= bit; - buf += bit; - len -= bit; - ret = ceph_pagelist_addpage(pl); - if (ret) - return ret; - } - - memcpy(pl->mapped_tail + (pl->length 
& ~PAGE_CACHE_MASK), buf, len); - pl->length += len; - pl->room -= len; - return 0; -} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 190b6c4a6f2b..39c243acd062 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,10 +1,12 @@ -#include "ceph_debug.h" +#include #include #include #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include /* * Snapshots in ceph are driven in large part by cooperation from the @@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; @@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; int op; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c similarity index 59% rename from fs/ceph/ceph_strings.c rename to fs/ceph/strings.c index c6179d3a26a2..cd5097d7c804 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/strings.c @@ -1,71 +1,9 @@ /* - * Ceph string constants + * Ceph fs string constants */ -#include "types.h" +#include +#include -const char *ceph_entity_type_name(int type) -{ - switch (type) { - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_AUTH: return "auth"; - default: return "unknown"; - } -} - -const char *ceph_osd_op_name(int op) -{ - switch (op) { - case CEPH_OSD_OP_READ: return "read"; - case CEPH_OSD_OP_STAT: return "stat"; - - case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; - - case CEPH_OSD_OP_WRITE: return "write"; - case CEPH_OSD_OP_DELETE: return "delete"; - 
case CEPH_OSD_OP_TRUNCATE: return "truncate"; - case CEPH_OSD_OP_ZERO: return "zero"; - case CEPH_OSD_OP_WRITEFULL: return "writefull"; - case CEPH_OSD_OP_ROLLBACK: return "rollback"; - - case CEPH_OSD_OP_APPEND: return "append"; - case CEPH_OSD_OP_STARTSYNC: return "startsync"; - case CEPH_OSD_OP_SETTRUNC: return "settrunc"; - case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; - - case CEPH_OSD_OP_TMAPUP: return "tmapup"; - case CEPH_OSD_OP_TMAPGET: return "tmapget"; - case CEPH_OSD_OP_TMAPPUT: return "tmapput"; - - case CEPH_OSD_OP_GETXATTR: return "getxattr"; - case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; - case CEPH_OSD_OP_SETXATTR: return "setxattr"; - case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; - case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; - case CEPH_OSD_OP_RMXATTR: return "rmxattr"; - case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; - - case CEPH_OSD_OP_PULL: return "pull"; - case CEPH_OSD_OP_PUSH: return "push"; - case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; - case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - case CEPH_OSD_OP_SCRUB: return "scrub"; - - case CEPH_OSD_OP_WRLOCK: return "wrlock"; - case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; - case CEPH_OSD_OP_RDLOCK: return "rdlock"; - case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; - case CEPH_OSD_OP_UPLOCK: return "uplock"; - case CEPH_OSD_OP_DNLOCK: return "dnlock"; - - case CEPH_OSD_OP_CALL: return "call"; - - case CEPH_OSD_OP_PGLS: return "pgls"; - } - return "???"; -} const char *ceph_mds_state_name(int s) { @@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o) } return "???"; } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: 
return "delete unmanaged snap"; - } - return "???"; -} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9922628532b2..d6e0e0421891 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1,5 +1,5 @@ -#include "ceph_debug.h" +#include #include #include @@ -15,10 +15,13 @@ #include #include -#include "decode.h" #include "super.h" -#include "mon_client.h" -#include "auth.h" +#include "mds_client.h" + +#include +#include +#include +#include /* * Ceph superblock operations @@ -26,36 +29,22 @@ * Handle the basics of mounting, unmounting. */ - -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} - - /* * super ops */ static void ceph_put_super(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); /* * ensure we release the bdi before put_anon_super releases * the device name. 
*/ - if (s->s_bdi == &client->backing_dev_info) { - bdi_unregister(&client->backing_dev_info); + if (s->s_bdi == &fsc->backing_dev_info) { + bdi_unregister(&fsc->backing_dev_info); s->s_bdi = NULL; } @@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); - struct ceph_monmap *monmap = client->monc.monmap; + struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; int err; dout("statfs\n"); - err = ceph_monc_do_statfs(&client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); if (err < 0) return err; @@ -104,49 +93,244 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_sync_fs(struct super_block *sb, int wait) { - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); if (!wait) { dout("sync_fs (non-blocking)\n"); - ceph_flush_dirty_caps(&client->mdsc); + ceph_flush_dirty_caps(fsc->mdsc); dout("sync_fs (non-blocking) done\n"); return 0; } dout("sync_fs (blocking)\n"); - ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); - ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); dout("sync_fs (blocking) done\n"); return 0; } -static int default_congestion_kb(void) +/* + * mount options + */ +enum { + Opt_wsize, + Opt_rsize, + Opt_caps_wanted_delay_min, + Opt_caps_wanted_delay_max, + Opt_cap_release_safety, + Opt_readdir_max_entries, + Opt_readdir_max_bytes, + Opt_congestion_kb, + Opt_last_int, + /* int args above */ + Opt_snapdirname, + Opt_last_string, + /* string args above */ + Opt_dirstat, + Opt_nodirstat, + Opt_rbytes, + Opt_norbytes, + Opt_noasyncreaddir, +}; + +static match_table_t fsopt_tokens = { + {Opt_wsize, "wsize=%d"}, + {Opt_rsize, "rsize=%d"}, +
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, + {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, + {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, + /* int args above */ + {Opt_snapdirname, "snapdirname=%s"}, + /* string args above */ + {Opt_dirstat, "dirstat"}, + {Opt_nodirstat, "nodirstat"}, + {Opt_rbytes, "rbytes"}, + {Opt_norbytes, "norbytes"}, + {Opt_noasyncreaddir, "noasyncreaddir"}, + {-1, NULL} +}; + +static int parse_fsopt_token(char *c, void *private) { - int congestion_kb; + struct ceph_mount_options *fsopt = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers.
- * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; - return congestion_kb; + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_cap_release_safety: + fsopt->cap_release_safety = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; + break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + default: + BUG_ON(token); + } + return 0; +} + +static void destroy_mount_options(struct ceph_mount_options *args) +{ + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} + +static int
strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; + + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err = -ENOMEM; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) + goto out; + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } + dev_name_end = *path; + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + + /* path on server */ + *path += 2; + dout("server path '%s'\n", *path); + + err = ceph_parse_options(popt, options, dev_name,
dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (err) + goto out; + + /* success */ + *pfsopt = fsopt; + return 0; + +out: + destroy_mount_options(fsopt); + return err; } /** @@ -156,59 +333,175 @@ static int default_congestion_kb(void) */ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) { - struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); - struct ceph_mount_args *args = client->mount_args; + struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + struct ceph_options *opt = fsc->client->options; - if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &args->fsid); - if (args->flags & CEPH_OPT_NOSHARE) + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsid=%pU", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) seq_puts(m, ",noshare"); - if (args->flags & CEPH_OPT_DIRSTAT) - seq_puts(m, ",dirstat"); - if ((args->flags & CEPH_OPT_RBYTES) == 0) - seq_puts(m, ",norbytes"); - if (args->flags & CEPH_OPT_NOCRC) + if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, ",nocrc"); - if (args->flags & CEPH_OPT_NOASYNCREADDIR) + + if (opt->name) + seq_printf(m, ",name=%s", opt->name); + if (opt->secret) + seq_puts(m, ",secret="); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); + if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); - if (args->mount_timeout != 
CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", args->mount_timeout); - if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); - if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", args->osd_timeout); - if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - args->osd_keepalive_timeout); - if (args->wsize) - seq_printf(m, ",wsize=%d", args->wsize); - if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) - seq_printf(m, ",rsize=%d", args->rsize); - if (args->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); - if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) seq_printf(m, ",caps_wanted_delay_min=%d", - args->caps_wanted_delay_min); - if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) seq_printf(m, ",caps_wanted_delay_max=%d", - args->caps_wanted_delay_max); - if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) seq_printf(m, ",cap_release_safety=%d", - args->cap_release_safety); - if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); - if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); - if (strcmp(args->snapdir_name, 
CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", args->snapdir_name); - if (args->name) - seq_printf(m, ",name=%s", args->name); - if (args->secret) - seq_puts(m, ",secret="); + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); return 0; } +/* + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. + */ +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; + int err = -ENOMEM; + + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) + return ERR_PTR(-ENOMEM); + + fsc->client = ceph_create_client(opt, fsc); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK; + fsc->client->monc.want_mdsmap = 1; + + fsc->mount_options = fsopt; + + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; + + atomic_long_set(&fsc->writeback_count, 0); + + err = bdi_init(&fsc->backing_dev_info); + if (err < 0) + goto fail_client; + + err = -ENOMEM; + fsc->wb_wq = create_workqueue("ceph-writeback"); + if (fsc->wb_wq == NULL) + goto fail_bdi; + fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + if (fsc->pg_inv_wq == 
NULL) + goto fail_wb_wq; + fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + if (fsc->trunc_wq == NULL) + goto fail_pg_inv_wq; + + /* set up mempools */ + err = -ENOMEM; + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, + fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + if (!fsc->wb_pagevec_pool) + goto fail_trunc_wq; + + /* caps */ + fsc->min_caps = fsopt->max_readdir; + + return fsc; + +fail_trunc_wq: + destroy_workqueue(fsc->trunc_wq); +fail_pg_inv_wq: + destroy_workqueue(fsc->pg_inv_wq); +fail_wb_wq: + destroy_workqueue(fsc->wb_wq); +fail_bdi: + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); +fail: + kfree(fsc); + return ERR_PTR(err); +} + +void destroy_fs_client(struct ceph_fs_client *fsc) +{ + dout("destroy_fs_client %p\n", fsc); + + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); + + bdi_destroy(&fsc->backing_dev_info); + + mempool_destroy(fsc->wb_pagevec_pool); + + destroy_mount_options(fsc->mount_options); + + ceph_fs_debugfs_cleanup(fsc); + + ceph_destroy_client(fsc->client); + + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); +} + /* * caches */ @@ -274,12 +567,12 @@ static void destroy_caches(void) */ static void ceph_umount_begin(struct super_block *sb) { - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); dout("ceph_umount_begin - starting forced umount\n"); - if (!client) + if (!fsc) return; - client->mount_state = CEPH_MOUNT_SHUTDOWN; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; return; } @@ -294,483 +587,15 @@ static const struct super_operations ceph_super_ops = { .umount_begin = ceph_umount_begin, }; - -const char *ceph_msg_type_name(int type) -{ - switch (type) { - case CEPH_MSG_SHUTDOWN: return "shutdown"; - case CEPH_MSG_PING: return "ping"; - case CEPH_MSG_AUTH: return "auth"; - case CEPH_MSG_AUTH_REPLY: return "auth_reply"; - case CEPH_MSG_MON_MAP: return "mon_map"; 
- case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; - case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; - case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; - case CEPH_MSG_STATFS: return "statfs"; - case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; - case CEPH_MSG_MDS_MAP: return "mds_map"; - case CEPH_MSG_CLIENT_SESSION: return "client_session"; - case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; - case CEPH_MSG_CLIENT_REQUEST: return "client_request"; - case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; - case CEPH_MSG_CLIENT_REPLY: return "client_reply"; - case CEPH_MSG_CLIENT_CAPS: return "client_caps"; - case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; - case CEPH_MSG_CLIENT_SNAP: return "client_snap"; - case CEPH_MSG_CLIENT_LEASE: return "client_lease"; - case CEPH_MSG_OSD_MAP: return "osd_map"; - case CEPH_MSG_OSD_OP: return "osd_op"; - case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; - default: return "unknown"; - } -} - - -/* - * mount options - */ -enum { - Opt_wsize, - Opt_rsize, - Opt_osdtimeout, - Opt_osdkeepalivetimeout, - Opt_mount_timeout, - Opt_osd_idle_ttl, - Opt_caps_wanted_delay_min, - Opt_caps_wanted_delay_max, - Opt_cap_release_safety, - Opt_readdir_max_entries, - Opt_readdir_max_bytes, - Opt_congestion_kb, - Opt_last_int, - /* int args above */ - Opt_fsid, - Opt_snapdirname, - Opt_name, - Opt_secret, - Opt_last_string, - /* string args above */ - Opt_ip, - Opt_noshare, - Opt_dirstat, - Opt_nodirstat, - Opt_rbytes, - Opt_norbytes, - Opt_nocrc, - Opt_noasyncreaddir, -}; - -static match_table_t arg_tokens = { - {Opt_wsize, "wsize=%d"}, - {Opt_rsize, "rsize=%d"}, - {Opt_osdtimeout, "osdtimeout=%d"}, - {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, - {Opt_mount_timeout, "mount_timeout=%d"}, - {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, - {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, - {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_cap_release_safety, 
"cap_release_safety=%d"}, - {Opt_readdir_max_entries, "readdir_max_entries=%d"}, - {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, - {Opt_congestion_kb, "write_congestion_kb=%d"}, - /* int args above */ - {Opt_fsid, "fsid=%s"}, - {Opt_snapdirname, "snapdirname=%s"}, - {Opt_name, "name=%s"}, - {Opt_secret, "secret=%s"}, - /* string args above */ - {Opt_ip, "ip=%s"}, - {Opt_noshare, "noshare"}, - {Opt_dirstat, "dirstat"}, - {Opt_nodirstat, "nodirstat"}, - {Opt_rbytes, "rbytes"}, - {Opt_norbytes, "norbytes"}, - {Opt_nocrc, "nocrc"}, - {Opt_noasyncreaddir, "noasyncreaddir"}, - {-1, NULL} -}; - -static int parse_fsid(const char *str, struct ceph_fsid *fsid) -{ - int i = 0; - char tmp[3]; - int err = -EINVAL; - int d; - - dout("parse_fsid '%s'\n", str); - tmp[2] = 0; - while (*str && i < 16) { - if (ispunct(*str)) { - str++; - continue; - } - if (!isxdigit(str[0]) || !isxdigit(str[1])) - break; - tmp[0] = str[0]; - tmp[1] = str[1]; - if (sscanf(tmp, "%x", &d) < 1) - break; - fsid->fsid[i] = d & 0xff; - i++; - str += 2; - } - - if (i == 16) - err = 0; - dout("parse_fsid ret %d got fsid %pU", err, fsid); - return err; -} - -static struct ceph_mount_args *parse_mount_args(int flags, char *options, - const char *dev_name, - const char **path) -{ - struct ceph_mount_args *args; - const char *c; - int err = -ENOMEM; - substring_t argstr[MAX_OPT_ARGS]; - - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return ERR_PTR(-ENOMEM); - args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), - GFP_KERNEL); - if (!args->mon_addr) - goto out; - - dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); - - /* start with defaults */ - args->sb_flags = flags; - args->flags = CEPH_OPT_DEFAULT; - args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; - args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ - args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ - args->caps_wanted_delay_min = 
CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; - args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - args->max_readdir = CEPH_MAX_READDIR_DEFAULT; - args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - args->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } - - /* get mon ip(s) */ - err = ceph_parse_ips(dev_name, *path, args->mon_addr, - CEPH_MAX_MON, &args->num_mon); - if (err < 0) - goto out; - - /* path on server */ - *path += 2; - dout("server path '%s'\n", *path); - - /* parse mount options */ - while ((c = strsep(&options, ",")) != NULL) { - int token, intval, ret; - if (!*c) - continue; - err = -EINVAL; - token = match_token((char *)c, arg_tokens, argstr); - if (token < 0) { - pr_err("bad mount option at '%s'\n", c); - goto out; - } - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); - continue; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else { - dout("got token %d\n", token); - } - switch (token) { - case Opt_ip: - err = ceph_parse_ips(argstr[0].from, - argstr[0].to, - &args->my_addr, - 1, NULL); - if (err < 0) - goto out; - args->flags |= CEPH_OPT_MYIP; - break; - - case Opt_fsid: - err = parse_fsid(argstr[0].from, &args->fsid); - if (err == 0) - args->flags |= CEPH_OPT_FSID; - break; - case Opt_snapdirname: - kfree(args->snapdir_name); - args->snapdir_name = kstrndup(argstr[0].from, - 
argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_name: - args->name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_secret: - args->secret = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - - /* misc */ - case Opt_wsize: - args->wsize = intval; - break; - case Opt_rsize: - args->rsize = intval; - break; - case Opt_osdtimeout: - args->osd_timeout = intval; - break; - case Opt_osdkeepalivetimeout: - args->osd_keepalive_timeout = intval; - break; - case Opt_osd_idle_ttl: - args->osd_idle_ttl = intval; - break; - case Opt_mount_timeout: - args->mount_timeout = intval; - break; - case Opt_caps_wanted_delay_min: - args->caps_wanted_delay_min = intval; - break; - case Opt_caps_wanted_delay_max: - args->caps_wanted_delay_max = intval; - break; - case Opt_readdir_max_entries: - args->max_readdir = intval; - break; - case Opt_readdir_max_bytes: - args->max_readdir_bytes = intval; - break; - case Opt_congestion_kb: - args->congestion_kb = intval; - break; - - case Opt_noshare: - args->flags |= CEPH_OPT_NOSHARE; - break; - - case Opt_dirstat: - args->flags |= CEPH_OPT_DIRSTAT; - break; - case Opt_nodirstat: - args->flags &= ~CEPH_OPT_DIRSTAT; - break; - case Opt_rbytes: - args->flags |= CEPH_OPT_RBYTES; - break; - case Opt_norbytes: - args->flags &= ~CEPH_OPT_RBYTES; - break; - case Opt_nocrc: - args->flags |= CEPH_OPT_NOCRC; - break; - case Opt_noasyncreaddir: - args->flags |= CEPH_OPT_NOASYNCREADDIR; - break; - - default: - BUG_ON(token); - } - } - return args; - -out: - kfree(args->mon_addr); - kfree(args); - return ERR_PTR(err); -} - -static void destroy_mount_args(struct ceph_mount_args *args) -{ - dout("destroy_mount_args %p\n", args); - kfree(args->snapdir_name); - args->snapdir_name = NULL; - kfree(args->name); - args->name = NULL; - kfree(args->secret); - args->secret = NULL; - kfree(args); -} - -/* - * create a fresh client instance - */ -static struct ceph_client 
*ceph_create_client(struct ceph_mount_args *args) -{ - struct ceph_client *client; - int err = -ENOMEM; - - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (client == NULL) - return ERR_PTR(-ENOMEM); - - mutex_init(&client->mount_mutex); - - init_waitqueue_head(&client->auth_wq); - - client->sb = NULL; - client->mount_state = CEPH_MOUNT_MOUNTING; - client->mount_args = args; - - client->msgr = NULL; - - client->auth_err = 0; - atomic_long_set(&client->writeback_count, 0); - - err = bdi_init(&client->backing_dev_info); - if (err < 0) - goto fail; - - err = -ENOMEM; - client->wb_wq = create_workqueue("ceph-writeback"); - if (client->wb_wq == NULL) - goto fail_bdi; - client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); - if (client->pg_inv_wq == NULL) - goto fail_wb_wq; - client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); - if (client->trunc_wq == NULL) - goto fail_pg_inv_wq; - - /* set up mempools */ - err = -ENOMEM; - client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - client->mount_args->wsize >> PAGE_CACHE_SHIFT); - if (!client->wb_pagevec_pool) - goto fail_trunc_wq; - - /* caps */ - client->min_caps = args->max_readdir; - - /* subsystems */ - err = ceph_monc_init(&client->monc, client); - if (err < 0) - goto fail_mempool; - err = ceph_osdc_init(&client->osdc, client); - if (err < 0) - goto fail_monc; - err = ceph_mdsc_init(&client->mdsc, client); - if (err < 0) - goto fail_osdc; - return client; - -fail_osdc: - ceph_osdc_stop(&client->osdc); -fail_monc: - ceph_monc_stop(&client->monc); -fail_mempool: - mempool_destroy(client->wb_pagevec_pool); -fail_trunc_wq: - destroy_workqueue(client->trunc_wq); -fail_pg_inv_wq: - destroy_workqueue(client->pg_inv_wq); -fail_wb_wq: - destroy_workqueue(client->wb_wq); -fail_bdi: - bdi_destroy(&client->backing_dev_info); -fail: - kfree(client); - return ERR_PTR(err); -} - -static void ceph_destroy_client(struct ceph_client *client) -{ - dout("destroy_client %p\n", client); - - /* unmount 
*/ - ceph_mdsc_stop(&client->mdsc); - ceph_osdc_stop(&client->osdc); - - /* - * make sure mds and osd connections close out before destroying - * the auth module, which is needed to free those connections' - * ceph_authorizers. - */ - ceph_msgr_flush(); - - ceph_monc_stop(&client->monc); - - ceph_debugfs_client_cleanup(client); - destroy_workqueue(client->wb_wq); - destroy_workqueue(client->pg_inv_wq); - destroy_workqueue(client->trunc_wq); - - bdi_destroy(&client->backing_dev_info); - - if (client->msgr) - ceph_messenger_destroy(client->msgr); - mempool_destroy(client->wb_pagevec_pool); - - destroy_mount_args(client->mount_args); - - kfree(client); - dout("destroy_client %p done\n", client); -} - -/* - * Initially learn our fsid, or verify an fsid matches. - */ -int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) -{ - if (client->have_fsid) { - if (ceph_fsid_compare(&client->fsid, fsid)) { - pr_err("bad fsid, had %pU got %pU", - &client->fsid, fsid); - return -1; - } - } else { - pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, - fsid); - memcpy(&client->fsid, fsid, sizeof(*fsid)); - ceph_debugfs_client_init(client); - client->have_fsid = true; - } - return 0; -} - -/* - * true if we have the mon map (and have thus joined the cluster) - */ -static int have_mon_and_osd_map(struct ceph_client *client) -{ - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; -} - /* * Bootstrap mount by opening the root directory. Note the mount * @started time from caller, and time out if this takes too long. 
*/ -static struct dentry *open_root_dentry(struct ceph_client *client, +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; @@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = client->mount_args->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout * HZ; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - client->sb->s_root == NULL) + fsc->sb->s_root == NULL) root = d_alloc_root(req->r_target_inode); else root = d_obtain_alias(req->r_target_inode); @@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client, return root; } + + + /* * mount: join the ceph cluster, and open root directory. 
*/ -static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, +static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, const char *path) { - struct ceph_entity_addr *myaddr = NULL; int err; - unsigned long timeout = client->mount_args->mount_timeout * HZ; unsigned long started = jiffies; /* note the start time */ struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ dout("mount start\n"); - mutex_lock(&client->mount_mutex); + mutex_lock(&fsc->client->mount_mutex); - /* initialize the messenger */ - if (client->msgr == NULL) { - if (ceph_test_opt(client, MYIP)) - myaddr = &client->mount_args->my_addr; - client->msgr = ceph_messenger_create(myaddr); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - client->msgr = NULL; - goto out; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); - } - - /* open session, and wait for mon, mds, and osd maps */ - err = ceph_monc_open_session(&client->monc); + err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; - while (!have_mon_and_osd_map(client)) { - err = -EIO; - if (timeout && time_after_eq(jiffies, started + timeout)) - goto out; - - /* wait */ - dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_and_osd_map(client) || (client->auth_err < 0), - timeout); - if (err == -EINTR || err == -ERESTARTSYS) - goto out; - if (client->auth_err < 0) { - err = client->auth_err; - goto out; - } - } - dout("mount opening root\n"); - root = open_root_dentry(client, "", started); + root = open_root_dentry(fsc, "", started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } - if (client->sb->s_root) + if (fsc->sb->s_root) { dput(root); - else - client->sb->s_root = root; + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } if (path[0] == 0) { dget(root); } else { dout("mount opening base mountpoint\n"); - root = 
open_root_dentry(client, path, started); + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); - dput(client->sb->s_root); - client->sb->s_root = NULL; - goto out; + goto fail; } } mnt->mnt_root = root; - mnt->mnt_sb = client->sb; + mnt->mnt_sb = fsc->sb; - client->mount_state = CEPH_MOUNT_MOUNTED; + fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); err = 0; out: - mutex_unlock(&client->mount_mutex); + mutex_unlock(&fsc->client->mount_mutex); return err; + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; } static int ceph_set_super(struct super_block *s, void *data) { - struct ceph_client *client = data; + struct ceph_fs_client *fsc = data; int ret; dout("set_super %p data %p\n", s, data); - s->s_flags = client->mount_args->sb_flags; + s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ - s->s_fs_info = client; - client->sb = s; + s->s_fs_info = fsc; + fsc->sb = s; s->s_op = &ceph_super_ops; s->s_export_op = &ceph_export_ops; @@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data) fail: s->s_fs_info = NULL; - client->sb = NULL; + fsc->sb = NULL; return ret; } @@ -926,30 +732,23 @@ fail: */ static int ceph_compare_super(struct super_block *sb, void *data) { - struct ceph_client *new = data; - struct ceph_mount_args *args = new->mount_args; - struct ceph_client *other = ceph_sb_to_client(sb); - int i; + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (args->flags & CEPH_OPT_FSID) { - if (ceph_fsid_compare(&args->fsid, &other->fsid)) { - dout("fsid doesn't match\n"); - return 0; - } - } else { - /* do we share (a) monitor? 
*/ - for (i = 0; i < new->monc.monmap->num_mon; i++) - if (ceph_monmap_contains(other->monc.monmap, - &new->monc.monmap->mon_inst[i].addr)) - break; - if (i == new->monc.monmap->num_mon) { - dout("mon ip not part of monmap\n"); - return 0; - } - dout("mon ip matches existing sb %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; } - if (args->sb_flags != other->mount_args->sb_flags) { + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fsopt->sb_flags != other->mount_options->sb_flags) { dout("flags differ\n"); return 0; } @@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) { int err; /* set ra_pages based on rsize mount option? 
*/ - if (client->mount_args->rsize >= PAGE_CACHE_SIZE) - client->backing_dev_info.ra_pages = - (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) - sb->s_bdi = &client->backing_dev_info; + sb->s_bdi = &fsc->backing_dev_info; return err; } @@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type, struct vfsmount *mnt) { struct super_block *sb; - struct ceph_client *client; + struct ceph_fs_client *fsc; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; const char *path = NULL; - struct ceph_mount_args *args; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; dout("ceph_get_sb\n"); - args = parse_mount_args(flags, data, dev_name, &path); - if (IS_ERR(args)) { - err = PTR_ERR(args); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) goto out_final; - } /* create client (which we may/may not use) */ - client = ceph_create_client(args); - if (IS_ERR(client)) { - err = PTR_ERR(client); + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + kfree(fsopt); + kfree(opt); goto out_final; } - if (client->mount_args->flags & CEPH_OPT_NOSHARE) + err = ceph_mdsc_init(fsc); + if (err < 0) + goto out; + + if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, client); + sb = sget(fs_type, compare_super, ceph_set_super, fsc); if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out; } - if (ceph_sb_to_client(sb) != client) { - ceph_destroy_client(client); - client = ceph_sb_to_client(sb); - dout("get_sb got existing client %p\n", client); + if (ceph_sb_to_client(sb) != fsc) 
{ + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); } else { - dout("get_sb using new client %p\n", client); - err = ceph_register_bdi(sb, client); + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); if (err < 0) goto out_splat; } - err = ceph_mount(client, mnt, path); + err = ceph_mount(fsc, mnt, path); if (err < 0) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, @@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type, return 0; out_splat: - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; out: - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); out_final: dout("ceph_get_sb fail %d\n", err); return err; @@ -1042,11 +849,12 @@ out_final: static void ceph_kill_sb(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(&client->mdsc); + ceph_mdsc_pre_umount(fsc->mdsc); kill_anon_super(s); /* will call put_super after sb is r/o */ - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); } static struct file_system_type ceph_fs_type = { @@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = { static int __init init_ceph(void) { - int ret = 0; - - ret = ceph_debugfs_init(); - if (ret < 0) - goto out; - - ret = ceph_msgr_init(); - if (ret < 0) - goto out_debugfs; - - ret = init_caches(); + int ret = init_caches(); if (ret) - goto out_msgr; + goto out; ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; - pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, - CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, - CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); 
+ pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + return 0; out_icache: destroy_caches(); -out_msgr: - ceph_msgr_exit(); -out_debugfs: - ceph_debugfs_cleanup(); out: return ret; } @@ -1101,8 +893,6 @@ static void __exit exit_ceph(void) dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); destroy_caches(); - ceph_msgr_exit(); - ceph_debugfs_cleanup(); } module_init(init_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b87638e84c4b..1886294e12f7 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,7 +1,7 @@ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H -#include "ceph_debug.h" +#include #include #include @@ -14,13 +14,7 @@ #include #include -#include "types.h" -#include "messenger.h" -#include "msgpool.h" -#include "mon_client.h" -#include "mds_client.h" -#include "osd_client.h" -#include "ceph_fs.h" +#include /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 @@ -30,42 +24,25 @@ #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -/* - * Supported features - */ -#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ -/* - * mount options - */ -#define CEPH_OPT_FSID (1<<0) -#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ -#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ -#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ -#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ -#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) -#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= 
CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define ceph_set_opt(client, opt) \ - (client)->mount_args->flags |= CEPH_OPT_##opt; -#define ceph_test_opt(client, opt) \ - (!!((client)->mount_args->flags & CEPH_OPT_##opt)) +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" - -struct ceph_mount_args { - int sb_flags; +struct ceph_mount_options { int flags; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int num_mon; - struct ceph_entity_addr *mon_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_timeout; - int osd_keepalive_timeout; + int sb_flags; + int wsize; int rsize; /* max readahead */ int congestion_kb; /* max writeback in flight */ @@ -73,82 +50,25 @@ struct ceph_mount_args { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ + char *snapdir_name; /* default ".snap" */ - char *name; - char *secret; }; -/* - * defaults - */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ -#define CEPH_MAX_READDIR_DEFAULT 1024 -#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) - -#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) - -#define CEPH_SNAPDIRNAME_DEFAULT ".snap" -#define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. 
- */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, -}; - -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) -{ - BUG_ON(time_after(b, a)); - return (long)a - (long)b; -} - -/* - * per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_client { - struct ceph_fsid fsid; - bool have_fsid; - - struct mutex mount_mutex; /* serialize mount attempts */ - struct ceph_mount_args *mount_args; - +struct ceph_fs_client { struct super_block *sb; + struct ceph_mount_options *mount_options; + struct ceph_client *client; + unsigned long mount_state; - wait_queue_head_t auth_wq; - - int auth_err; - int min_caps; /* min caps i added */ - struct ceph_messenger *msgr; /* messenger instance */ - struct ceph_mon_client monc; - struct ceph_mds_client mdsc; - struct ceph_osd_client osdc; + struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; @@ -160,14 +80,14 @@ struct ceph_client { struct backing_dev_info backing_dev_info; #ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_monmap; - struct dentry *debugfs_mdsmap, *debugfs_osdmap; - struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif }; + /* * File i/o capability. 
This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -275,6 +195,20 @@ struct ceph_inode_xattr { int should_free_val; }; +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info { /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ - struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ @@ -391,69 +320,9 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return container_of(inode, struct ceph_inode_info, vfs_inode); } -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline struct ceph_vino ceph_vino(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~mask; - spin_unlock(&inode->i_lock); -} - -static inline void ceph_i_set(struct inode *inode, unsigned mask) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&inode->i_lock); - ci->i_ceph_flags |= mask; - spin_unlock(&inode->i_lock); -} - -static inline bool ceph_i_test(struct inode *inode, unsigned mask) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - bool r; - - smp_mb(); - r = (ci->i_ceph_flags & mask) == mask; - return r; -} - - -/* find a specific frag @f */ -extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, - u32 f); - -/* - * choose fragment for value @v. 
copy frag content to pfrag, if leaf - * exists - */ -extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found); - -/* - * Ceph dentry state - */ -struct ceph_dentry_info { - struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; - u32 lease_seq; - unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; - u64 time; - u64 offset; -}; - -static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) -{ - return (struct ceph_dentry_info *)dentry->d_fsdata; -} - -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; + return ceph_inode(inode)->i_vino; } /* @@ -472,18 +341,6 @@ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) return ino; } -static inline int ceph_set_ino_cb(struct inode *inode, void *data) -{ - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); - return 0; -} - -static inline struct ceph_vino ceph_vino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino; -} - /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -512,6 +369,73 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, } +/* + * Ceph inode. 
+ */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + +static inline void ceph_i_clear(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~mask; + spin_unlock(&inode->i_lock); +} + +static inline void ceph_i_set(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags |= mask; + spin_unlock(&inode->i_lock); +} + +static inline bool ceph_i_test(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool r; + + spin_lock(&inode->i_lock); + r = (ci->i_ceph_flags & mask) == mask; + spin_unlock(&inode->i_lock); + return r; +} + + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. 
copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +{ + return ((loff_t)frag << 32) | (loff_t)off; +} + +static inline int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + /* * caps helpers */ @@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); -extern void ceph_reservation_status(struct ceph_client *client, +extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); -static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) { - return (struct ceph_client *)inode->i_sb->s_fs_info; + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) { - return (struct ceph_client *)sb->s_fs_info; + return (struct ceph_fs_client *)sb->s_fs_info; } @@ -616,51 +540,6 @@ struct ceph_file_info { -/* - * snapshots - */ - -/* - * A "snap context" is the set of existing snapshots when we - * write data. It is used by the OSD to guide its COW behavior. - * - * The ceph_snap_context is refcounted, and attached to each dirty - * page, indicating which context the dirty data belonged when it was - * dirtied. 
- */ -struct ceph_snap_context { - atomic_t nref; - u64 seq; - int num_snaps; - u64 snaps[]; -}; - -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} - /* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves @@ -699,16 +578,33 @@ struct ceph_snap_realm { spinlock_t inodes_with_caps_lock; }; - - -/* - * calculate the number of pages a given length and offset map onto, - * if we align the data. - */ -static inline int calc_pages_for(u64 off, u64 len) +static inline int default_congestion_kb(void) { - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. 
+ * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; } @@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) ci_item)->writing; } - -/* super.c */ -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; - -extern const char *ceph_msg_type_name(int type); -extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); - /* inode.c */ extern const struct inode_operations ceph_file_iops; @@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_open(struct inode *inode, struct file *file); extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir); extern int ceph_release(struct inode *inode, struct file *filp); -extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; @@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; -/* debugfs.c */ -extern int ceph_debugfs_init(void); -extern void ceph_debugfs_cleanup(void); -extern int ceph_debugfs_client_init(struct ceph_client *client); -extern void ceph_debugfs_client_cleanup(struct ceph_client *client); 
- /* locks.c */ extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); @@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) return NULL; } +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9578af610b73..6e12a6ba5f79 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1,6 +1,9 @@ -#include "ceph_debug.h" +#include + #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include #include #include @@ -620,12 +623,12 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; int err; int i, nr_pages; struct page **pages = NULL; @@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; - newname = kmalloc(name_len + 1, GFP_NOFS); + newname = kmemdup(name, name_len + 1, GFP_NOFS); if (!newname) goto out; - memcpy(newname, name, name_len + 1); if (val_len) { newval = kmalloc(val_len + 1, GFP_NOFS); @@ -777,8 +779,8 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc 
= fsc->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h similarity index 97% rename from fs/ceph/auth.h rename to include/linux/ceph/auth.h index d38a2fb4a137..7fff521d7eb5 100644 --- a/fs/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -1,8 +1,8 @@ #ifndef _FS_CEPH_AUTH_H #define _FS_CEPH_AUTH_H -#include "types.h" -#include "buffer.h" +#include +#include /* * Abstract interface for communicating with the authenticate module. diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h similarity index 100% rename from fs/ceph/buffer.h rename to include/linux/ceph/buffer.h diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h similarity index 86% rename from fs/ceph/ceph_debug.h rename to include/linux/ceph/ceph_debug.h index 1818c2305610..aa2e19182d99 100644 --- a/fs/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,7 +3,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#ifdef CONFIG_CEPH_FS_PRETTYDEBUG +#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* * wrap pr_debug to include a filename:lineno prefix on each line. @@ -14,7 +14,8 @@ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) 
\ - pr_debug(" %12.12s:%-4d : " fmt, \ + pr_debug("%.*s %12.12s:%-4d : " fmt, \ + 8 - (int)sizeof(KBUILD_MODNAME), " ", \ ceph_file_part(__FILE__, sizeof(__FILE__)), \ __LINE__, ##__VA_ARGS__) # else diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h similarity index 100% rename from fs/ceph/ceph_frag.h rename to include/linux/ceph/ceph_frag.h diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h similarity index 99% rename from fs/ceph/ceph_fs.h rename to include/linux/ceph/ceph_fs.h index d5619ac86711..c3c74aef289d 100644 --- a/fs/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_MDS_OP_SETATTR = 0x01108, CEPH_MDS_OP_SETFILELOCK= 0x01109, CEPH_MDS_OP_GETFILELOCK= 0x00110, + CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, CEPH_MDS_OP_MKNOD = 0x01201, CEPH_MDS_OP_LINK = 0x01202, diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h similarity index 100% rename from fs/ceph/ceph_hash.h rename to include/linux/ceph/ceph_hash.h diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h new file mode 100644 index 000000000000..2a79702e092b --- /dev/null +++ b/include/linux/ceph/debugfs.h @@ -0,0 +1,33 @@ +#ifndef _FS_CEPH_DEBUGFS_H +#define _FS_CEPH_DEBUGFS_H + +#include "ceph_debug.h" +#include "types.h" + +#define CEPH_DEFINE_SHOW_FUNC(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + /* \ + * Pass i_private directly to single_open(), which stores it in \ + * seq_file->private on success. Unlike patching it in \ + * afterwards via file->private_data, this dereferences nothing \ + * when single_open() fails. \ + */ \ + return single_open(file, name, inode->i_private); \ +} \ + \ +static const struct file_operations name##_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +/* debugfs.c */ +extern int ceph_debugfs_init(void); +extern void ceph_debugfs_cleanup(void); +extern int ceph_debugfs_client_init(struct ceph_client *client); +extern void ceph_debugfs_client_cleanup(struct ceph_client *client); + +#endif + diff --git 
a/fs/ceph/decode.h b/include/linux/ceph/decode.h similarity index 96% rename from fs/ceph/decode.h rename to include/linux/ceph/decode.h index 3d25415afe63..c5b6939fb32a 100644 --- a/fs/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end, ceph_encode_need(p, end, n, bad); \ ceph_encode_copy(p, pv, n); \ } while (0) +#define ceph_encode_string_safe(p, end, s, n, bad) \ + do { \ + ceph_encode_need(p, end, (n) + sizeof(u32), bad); /* u32 length prefix + n bytes */ \ + ceph_encode_string(p, end, s, n); \ + } while (0) #endif diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h new file mode 100644 index 000000000000..f22b2e941686 --- /dev/null +++ b/include/linux/ceph/libceph.h @@ -0,0 +1,249 @@ +#ifndef _FS_CEPH_LIBCEPH_H +#define _FS_CEPH_LIBCEPH_H + +#include "ceph_debug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "messenger.h" +#include "msgpool.h" +#include "mon_client.h" +#include "osd_client.h" +#include "ceph_fs.h" + +/* + * Supported features + */ +#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR + +/* + * mount options + */ +#define CEPH_OPT_FSID (1<<0) +#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ +#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ + +#define CEPH_OPT_DEFAULT (0) + +#define ceph_set_opt(client, opt) \ + ((client)->options->flags |= CEPH_OPT_##opt) +#define ceph_test_opt(client, opt) \ + (!!((client)->options->flags & CEPH_OPT_##opt)) + +struct ceph_options { + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; + int mount_timeout; + int osd_idle_ttl; + int osd_timeout; + int osd_keepalive_timeout; + + /* + * any type that can't be simply compared or doesn't need + * to be compared should go beyond this point, + * 
ceph_compare_options() should be updated accordingly + */ + + struct ceph_entity_addr *mon_addr; /* should be the first + pointer type of args */ + int num_mon; + char *name; + char *secret; +}; + +/* + * defaults + */ +#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 +#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ +#define CEPH_OSD_KEEPALIVE_DEFAULT 5 +#define CEPH_OSD_IDLE_TTL_DEFAULT 60 +#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ + +#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) + +#define CEPH_AUTH_NAME_DEFAULT "guest" + +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the opportunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) + +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, +}; + +/* + * subtract jiffies + */ +static inline unsigned long time_sub(unsigned long a, unsigned long b) +{ + BUG_ON(time_after(b, a)); + return (long)a - (long)b; +} + +struct ceph_mds_client; + +/* + * per client state + * + * possibly shared by multiple mount points, if they are + * mounting the same ceph filesystem/cluster. 
+ */ +struct ceph_client { + struct ceph_fsid fsid; + bool have_fsid; + + void *private; + + struct ceph_options *options; + + struct mutex mount_mutex; /* serialize mount attempts */ + wait_queue_head_t auth_wq; + int auth_err; + + int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); + + u32 supported_features; + u32 required_features; + + struct ceph_messenger *msgr; /* messenger instance */ + struct ceph_mon_client monc; + struct ceph_osd_client osdc; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *debugfs_monmap; + struct dentry *debugfs_osdmap; +#endif +}; + + + +/* + * snapshots + */ + +/* + * A "snap context" is the set of existing snapshots when we + * write data. It is used by the OSD to guide its COW behavior. + * + * The ceph_snap_context is refcounted, and attached to each dirty + * page, indicating which context the dirty data belonged when it was + * dirtied. + */ +struct ceph_snap_context { + atomic_t nref; + u64 seq; + int num_snaps; + u64 snaps[]; +}; + +static inline struct ceph_snap_context * +ceph_get_snap_context(struct ceph_snap_context *sc) +{ + /* + printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)+1); + */ + if (sc) + atomic_inc(&sc->nref); + return sc; +} + +static inline void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + /* + printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)-1); + */ + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} + +/* + * calculate the number of pages a given length and offset map onto, + * if we align the data. 
+ */ +static inline int calc_pages_for(u64 off, u64 len) +{ + return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - + (off >> PAGE_CACHE_SHIFT); +} + +/* ceph_common.c */ +extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + +extern int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private); +extern void ceph_destroy_options(struct ceph_options *opt); +extern int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client); +extern struct ceph_client *ceph_create_client(struct ceph_options *opt, + void *private); +extern u64 ceph_client_id(struct ceph_client *client); +extern void ceph_destroy_client(struct ceph_client *client); +extern int __ceph_open_session(struct ceph_client *client, + unsigned long started); +extern int ceph_open_session(struct ceph_client *client); + +/* pagevec.c */ +extern void ceph_release_page_vector(struct page **pages, int num_pages); + +extern struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len); +extern void ceph_put_page_vector(struct page **pages, int num_pages); +extern void ceph_release_page_vector(struct page **pages, int num_pages); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +extern int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len); +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern int 
ceph_copy_page_vector_to_user(struct page **pages, char __user *data, + loff_t off, size_t len); +extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); + + +#endif /* _FS_CEPH_LIBCEPH_H */ diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h similarity index 100% rename from fs/ceph/mdsmap.h rename to include/linux/ceph/mdsmap.h diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h similarity index 95% rename from fs/ceph/messenger.h rename to include/linux/ceph/messenger.h index 76fbc957bc13..5956d62c3057 100644 --- a/fs/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -65,6 +65,9 @@ struct ceph_messenger { */ u32 global_seq; spinlock_t global_seq_lock; + + u32 supported_features; + u32 required_features; }; /* @@ -82,6 +85,10 @@ struct ceph_msg { struct ceph_pagelist *pagelist; /* instead of pages */ struct list_head list_head; struct kref kref; + struct bio *bio; /* instead of pages/pagelist */ + struct bio *bio_iter; /* bio iterator */ + int bio_seg; /* current bio segment */ + struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; @@ -205,7 +212,7 @@ struct ceph_connection { }; -extern const char *pr_addr(const struct sockaddr_storage *ss); +extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); @@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); extern struct ceph_messenger *ceph_messenger_create( - struct ceph_entity_addr *myaddr); + struct ceph_entity_addr *myaddr, + u32 features, u32 required); extern void ceph_messenger_destroy(struct ceph_messenger *); extern void ceph_con_init(struct ceph_messenger *msgr, diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h similarity index 99% rename from fs/ceph/mon_client.h rename to include/linux/ceph/mon_client.h 
index 8e396f2c0963..545f85917780 100644 --- a/fs/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -79,6 +79,7 @@ struct ceph_mon_client { u64 last_tid; /* mds/osd map */ + int want_mdsmap; int want_next_osdmap; /* 1 = want, 2 = want+asked */ u32 have_osdmap, have_mdsmap; diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h similarity index 100% rename from fs/ceph/msgpool.h rename to include/linux/ceph/msgpool.h diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h similarity index 100% rename from fs/ceph/msgr.h rename to include/linux/ceph/msgr.h diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h similarity index 76% rename from fs/ceph/osd_client.h rename to include/linux/ceph/osd_client.h index ce776989ef6a..6c91fb032c39 100644 --- a/fs/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -15,6 +15,7 @@ struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; struct ceph_authorizer; +struct ceph_pagelist; /* * completion callback for async writepages @@ -68,6 +69,7 @@ struct ceph_osd_request { struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ + void *r_priv; /* ditto */ char r_oid[40]; /* object name */ int r_oid_len; @@ -80,6 +82,11 @@ struct ceph_osd_request { struct page **r_pages; /* pages for data payload */ int r_pages_from_pool; int r_own_pages; /* if true, i own page list */ +#ifdef CONFIG_BLOCK + struct bio *r_bio; /* instead of pages */ +#endif + + struct ceph_pagelist *r_trail; /* trailing part of the data */ }; struct ceph_osd_client { @@ -110,6 +117,42 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op_reply; }; +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 flags; /* CEPH_OSD_FLAG_* */ + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *name; + u32 name_len; + const char *val; + u32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* 
CEPH_OSD_CMPXATTR_MODE_* */ + } xattr; + struct { + const char *class_name; + __u8 class_len; + const char *method_name; + __u8 method_len; + __u8 argc; + const char *indata; + u32 indata_len; + } cls; + struct { + u64 cookie, count; + } pgls; + struct { + u64 snapid; + } snap; + }; + u32 payload_len; +}; + extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); @@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op); + +extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, + struct ceph_snap_context *snapc, + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio); + +extern void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len); + extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h similarity index 97% rename from fs/ceph/osdmap.h rename to include/linux/ceph/osdmap.h index 970b547e510d..ba4c205cbb01 100644 --- a/fs/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -4,7 +4,7 @@ #include #include "types.h" #include "ceph_fs.h" -#include "crush/crush.h" +#include /* * The osd map describes the current membership of the osd cluster and @@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, extern int ceph_calc_pg_primary(struct 
ceph_osdmap *osdmap, struct ceph_pg pgid); +extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); + #endif diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h similarity index 62% rename from fs/ceph/pagelist.h rename to include/linux/ceph/pagelist.h index e8a4187e1087..9660d6b0a35d 100644 --- a/fs/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -8,6 +8,14 @@ struct ceph_pagelist { void *mapped_tail; size_t length; size_t room; + struct list_head free_list; + size_t num_pages_free; +}; + +struct ceph_pagelist_cursor { + struct ceph_pagelist *pl; /* pagelist, for error checking */ + struct list_head *page_lru; /* page in list */ + size_t room; /* room remaining to reset to */ }; static inline void ceph_pagelist_init(struct ceph_pagelist *pl) @@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->mapped_tail = NULL; pl->length = 0; pl->room = 0; + INIT_LIST_HEAD(&pl->free_list); + pl->num_pages_free = 0; } + extern int ceph_pagelist_release(struct ceph_pagelist *pl); -extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); +extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); + +extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); + +extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); + +extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); + +extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h similarity index 100% rename from fs/ceph/rados.h rename to include/linux/ceph/rados.h diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h similarity index 100% rename from fs/ceph/types.h rename to include/linux/ceph/types.h diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h 
similarity index 100% rename from fs/ceph/crush/crush.h rename to include/linux/crush/crush.h diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h similarity index 100% rename from fs/ceph/crush/hash.h rename to include/linux/crush/hash.h diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h similarity index 100% rename from fs/ceph/crush/mapper.h rename to include/linux/crush/mapper.h diff --git a/net/Kconfig b/net/Kconfig index e926884c1675..55fd82e9ffd9 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -293,6 +293,7 @@ source "net/wimax/Kconfig" source "net/rfkill/Kconfig" source "net/9p/Kconfig" source "net/caif/Kconfig" +source "net/ceph/Kconfig" endif # if NET diff --git a/net/Makefile b/net/Makefile index ea60fbce9b1b..6b7bfd7f1416 100644 --- a/net/Makefile +++ b/net/Makefile @@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o endif obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ +obj-$(CONFIG_CEPH_LIB) += ceph/ diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig new file mode 100644 index 000000000000..ad424049b0cf --- /dev/null +++ b/net/ceph/Kconfig @@ -0,0 +1,28 @@ +config CEPH_LIB + tristate "Ceph core library (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include cephlib, which provides the + common functionality to both the Ceph filesystem and + to the rados block device (rbd). + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_LIB_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_LIB + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This increases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. 
+ diff --git a/net/ceph/Makefile b/net/ceph/Makefile new file mode 100644 index 000000000000..aab1cabb8035 --- /dev/null +++ b/net/ceph/Makefile @@ -0,0 +1,37 @@ +# +# Makefile for CEPH filesystem. +# + +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_CEPH_LIB) += libceph.o + +libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o \ + pagevec.o + +else +#Otherwise we were called directly from the command +# line; invoke the kernel build system. + +KERNELDIR ?= /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +default: all + +all: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install + +clean: + $(MAKE) -C $(KERNELDIR) M=$(PWD) clean + +endif diff --git a/fs/ceph/armor.c b/net/ceph/armor.c similarity index 100% rename from fs/ceph/armor.c rename to net/ceph/armor.c diff --git a/fs/ceph/auth.c b/net/ceph/auth.c similarity index 97% rename from fs/ceph/auth.c rename to net/ceph/auth.c index 6d2e30600627..549c1f43e1d5 100644 --- a/fs/ceph/auth.c +++ b/net/ceph/auth.c @@ -1,16 +1,16 @@ -#include "ceph_debug.h" +#include #include #include #include -#include "types.h" +#include +#include +#include +#include #include "auth_none.h" #include "auth_x.h" -#include "decode.h" -#include "super.h" -#include "messenger.h" /* * get protocol handler diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c similarity index 96% rename from fs/ceph/auth_none.c rename to net/ceph/auth_none.c index ad1dc21286c7..214c2bb43d62 100644 --- a/fs/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -1,14 +1,15 @@ -#include "ceph_debug.h" +#include #include #include #include #include +#include +#include + #include "auth_none.h" -#include "auth.h" -#include "decode.h" static void reset(struct 
ceph_auth_client *ac) { diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h similarity index 94% rename from fs/ceph/auth_none.h rename to net/ceph/auth_none.h index 8164df1a08be..ed7d088b1bc9 100644 --- a/fs/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -2,8 +2,7 @@ #define _FS_CEPH_AUTH_NONE_H #include - -#include "auth.h" +#include /* * null security mode. diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c similarity index 99% rename from fs/ceph/auth_x.c rename to net/ceph/auth_x.c index a2d002cbdec2..7fd5dfcf6e18 100644 --- a/fs/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -1,16 +1,17 @@ -#include "ceph_debug.h" +#include #include #include #include #include +#include +#include + +#include "crypto.h" #include "auth_x.h" #include "auth_x_protocol.h" -#include "crypto.h" -#include "auth.h" -#include "decode.h" #define TEMP_TICKET_BUF_LEN 256 diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h similarity index 96% rename from fs/ceph/auth_x.h rename to net/ceph/auth_x.h index ff6f8180e681..e02da7a5c5a1 100644 --- a/fs/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -3,8 +3,9 @@ #include +#include + #include "crypto.h" -#include "auth.h" #include "auth_x_protocol.h" /* diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h similarity index 100% rename from fs/ceph/auth_x_protocol.h rename to net/ceph/auth_x_protocol.h diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c similarity index 86% rename from fs/ceph/buffer.c rename to net/ceph/buffer.c index cd39f17021de..53d8abfa25d5 100644 --- a/fs/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -1,10 +1,11 @@ -#include "ceph_debug.h" +#include +#include #include -#include "buffer.h" -#include "decode.h" +#include +#include struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) dout("buffer_new %p\n", b); return b; } +EXPORT_SYMBOL(ceph_buffer_new); void ceph_buffer_release(struct kref *kref) { @@ -46,6 +48,7 @@ void ceph_buffer_release(struct 
kref *kref)
 	}
 	kfree(b);
 }
+EXPORT_SYMBOL(ceph_buffer_release);
 
 int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
 {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f3e4a13fea0c
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include
+#include
+#include
+#include
+#include
+
+
+
+/*
+ * Return the filename portion of a path (/foo/bar/baz -> baz).
+ *
+ * @s points at the path and @len is its length in bytes; the walk is
+ * bounded by @len, not by a NUL.  The result points into @s, just past
+ * the last '/' found, or at @s itself when there is no '/'.
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *e = s + len;
+
+	/* scan backwards from the end until we step over a '/' */
+	while (e != s && *(e-1) != '/')
+		e--;
+	return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+/*
+ * Map a CEPH_MSG_* message type to a short name for log output.
+ * Types that are not listed yield "unknown".
+ */
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	default: return "unknown";
+	}
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ *
+ * The first call copies @fsid into @client, creates the per-client
+ * debugfs entries and marks the fsid as known.  Every later call
+ * compares @fsid with the recorded one.
+ *
+ * Returns 0 when the fsid was recorded or matches, -1 on a mismatch.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			/* terminate the record so it is not merged with the next one */
+			pr_err("bad fsid, had %pU got %pU\n",
+			       &client->fsid, fsid);
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+/*
+ * strcmp() that tolerates NULL arguments: two NULLs compare equal, a
+ * non-NULL @s1 against a NULL @s2 gives -1, and the reverse gives 1.
+ */
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
+
+/*
+ * Decide whether @new_opt describes the same cluster/session as the
+ * options already attached to @client.
+ *
+ * Returns 0 on a match.  A match requires that the leading part of the
+ * two structures (everything before mon_addr) is bytewise identical,
+ * that name and secret are equal, and that at least one monitor address
+ * in @new_opt appears in the client's current monmap.  Any other result
+ * is nonzero.
+ */
+int ceph_compare_options(struct ceph_options *new_opt,
+			 struct ceph_client *client)
+{
+	struct ceph_options *opt1 = new_opt;
+	struct ceph_options *opt2 = client->options;
+	int ofs = offsetof(struct ceph_options, mon_addr);
+	int i;
+	int ret;
+
+	/* the pointer members start at mon_addr; compare them by value below */
+	ret = memcmp(opt1, opt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->name, opt2->name);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->secret, opt2->secret);
+	if (ret)
+		return ret;
+
+	/* any matching mon ip implies a match */
+	for (i = 0; i < opt1->num_mon; i++) {
+		if (ceph_monmap_contains(client->monc.monmap,
+					 &opt1->mon_addr[i]))
+			return 0;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+/*
+ * Parse a textual fsid into @fsid: 16 bytes written as pairs of hex
+ * digits, with any punctuation between them skipped.
+ *
+ * Returns 0 once 16 bytes have been read, -EINVAL otherwise (in which
+ * case @fsid may be partially filled).
+ */
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+	int i = 0;
+	char tmp[3];
+	int err = -EINVAL;
+	int d;
+
+	dout("parse_fsid '%s'\n", str);
+	tmp[2] = 0;
+	while (*str && i < 16) {
+		if (ispunct(*str)) {
+			str++;
+			continue;
+		}
+		/* need two hex digits to form one byte */
+		if (!isxdigit(str[0]) || !isxdigit(str[1]))
+			break;
+		tmp[0] = str[0];
+		tmp[1] = str[1];
+		if (sscanf(tmp, "%x", &d) < 1)
+			break;
+		fsid->fsid[i] = d & 0xff;
+		i++;
+		str += 2;
+	}
+
+	if (i == 16)
+		err = 0;
+	dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
+	return err;
+}
+
+/*
+ * ceph options
+ *
+ * The enum is ordered so that the kind of argument can be derived from
+ * the token value: everything below Opt_last_int takes an integer,
+ * everything between Opt_last_int and Opt_last_string takes a string,
+ * and the remainder are bare flags.
+ */
+enum {
+	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
+	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
+	Opt_last_int,
+	/* int args above */
+	Opt_fsid,
+	Opt_name,
+	Opt_secret,
+	Opt_ip,
+	Opt_last_string,
+	/* string args above */
+	Opt_noshare,
+	Opt_nocrc,
+};
+
+static match_table_t opt_tokens = {
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	/* int args above */
+	{Opt_fsid, "fsid=%s"},
+	{Opt_name, "name=%s"},
+	{Opt_secret, "secret=%s"},
+	{Opt_ip, "ip=%s"},
+	/* string args above */
+	{Opt_noshare, "noshare"},
+	{Opt_nocrc, "nocrc"},
+	{-1, NULL}
+};
+
+/*
+ * Free a ceph_options and everything ceph_parse_options() allocated
+ * for it: the name and secret strings and the monitor address array.
+ */
+void ceph_destroy_options(struct ceph_options *opt)
+{
+	dout("destroy_options %p\n", opt);
+	kfree(opt->name);
+	kfree(opt->secret);
+	kfree(opt->mon_addr);	/* kcalloc'd in ceph_parse_options() */
+	kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+/*
+ * Build a ceph_options from a monitor address list and a
+ * comma-separated option string.
+ *
+ * @popt:		receives the new options on success; release them
+ *			with ceph_destroy_options()
+ * @options:		option string, split in place by strsep()
+ * @dev_name,
+ * @dev_name_end:	bounds of the "ip1[:port1][,ip2[:port2]...]" list
+ * @parse_extra_token:	optional callback offered every option that is
+ *			not in opt_tokens; a negative return aborts
+ * @private:		passed through to @parse_extra_token
+ *
+ * Returns 0 on success or a negative errno.  On failure nothing is
+ * stored in *popt and all partial allocations are released.
+ */
+int ceph_parse_options(struct ceph_options **popt, char *options,
+		       const char *dev_name, const char *dev_name_end,
+		       int (*parse_extra_token)(char *c, void *private),
+		       void *private)
+{
+	struct ceph_options *opt;
+	const char *c;
+	int err = -ENOMEM;
+	substring_t argstr[MAX_OPT_ARGS];
+
+	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+	if (!opt)
+		return err;
+	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+				GFP_KERNEL);
+	if (!opt->mon_addr)
+		goto out;
+
+	dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+	     dev_name);
+
+	/* start with defaults */
+	opt->flags = CEPH_OPT_DEFAULT;
+	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+	/* get mon ip(s) */
+	/* ip1[:port1][,ip2[:port2]...] */
+	err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+			     CEPH_MAX_MON, &opt->num_mon);
+	if (err < 0)
+		goto out;
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		err = -EINVAL;
+		token = match_token((char *)c, opt_tokens, argstr);
+		if (token < 0) {
+			/*
+			 * Not one of ours.  Without an extra parser there
+			 * is nobody left to ask, so reject the option
+			 * instead of reading an unfilled argstr[] below.
+			 */
+			if (!parse_extra_token) {
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			/* extra? */
+			err = parse_extra_token((char *)c, private);
+			if (err < 0) {
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			continue;
+		}
+		if (token < Opt_last_int) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				/* logged and skipped, not fatal */
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
+		}
+		switch (token) {
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &opt->my_addr,
+					     1, NULL);
+			if (err < 0)
+				goto out;
+			opt->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_fsid:
+			/* a malformed fsid is ignored rather than fatal */
+			err = parse_fsid(argstr[0].from, &opt->fsid);
+			if (err == 0)
+				opt->flags |= CEPH_OPT_FSID;
+			break;
+		case Opt_name:
+			/* drop an earlier name= so repeats do not leak */
+			kfree(opt->name);
+			opt->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			if (!opt->name) {
+				err = -ENOMEM;
+				goto out;
+			}
+			break;
+		case Opt_secret:
+			/* drop an earlier secret= so repeats do not leak */
+			kfree(opt->secret);
+			opt->secret = kstrndup(argstr[0].from,
+						argstr[0].to-argstr[0].from,
+						GFP_KERNEL);
+			if (!opt->secret) {
+				err = -ENOMEM;
+				goto out;
+			}
+			break;
+
+			/* misc */
+		case Opt_osdtimeout:
+			opt->osd_timeout = intval;
+			break;
+		case Opt_osdkeepalivetimeout:
+			opt->osd_keepalive_timeout = intval;
+			break;
+		case Opt_osd_idle_ttl:
+			opt->osd_idle_ttl = intval;
+			break;
+		case Opt_mount_timeout:
+			opt->mount_timeout = intval;
+			break;
+
+		case Opt_noshare:
+			opt->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_nocrc:
+			opt->flags |= CEPH_OPT_NOCRC;
+			break;
+
+		default:
+			/* a token in opt_tokens that has no case above */
+			BUG_ON(token);
+		}
+	}
+
+	/* success */
+	*popt = opt;
+	return 0;
+
+out:
+	ceph_destroy_options(opt);
+	return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+/*
+ * Return the global id the auth module holds for this client.
+ */
+u64 ceph_client_id(struct ceph_client *client)
+{
+	return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ *
+ * @opt is stored in the client and freed by ceph_destroy_client();
+ * @private is an opaque pointer kept for the caller.  The messenger is
+ * not created here; that happens when the session is opened.
+ *
+ * Returns the new client or an ERR_PTR().  On failure @opt is not
+ * freed and remains owned by the caller.
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+	struct ceph_client *client;
+	int err = -ENOMEM;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	client->private = private;
+	client->options = opt;
+
+	mutex_init(&client->mount_mutex);
+	init_waitqueue_head(&client->auth_wq);
+	client->auth_err = 0;
+
+	client->extra_mon_dispatch = NULL;
+	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+	client->msgr = NULL;
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto fail;
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0)
+		goto fail_monc;
+
+	return client;
+
+fail_monc:
+	ceph_monc_stop(&client->monc);
+fail:
+	kfree(client);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+/*
+ * Tear down a client: stop the osd and mon clients, remove the debugfs
+ * entries, destroy the messenger if one was created, and free the
+ * options and the client itself.
+ */
+void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_osdc_stop(&client->osdc);
+
+	/*
+	 * make sure mds and osd connections close out before destroying
+	 * the auth module, which is needed to free those connections'
+	 * ceph_authorizers.
+	 */
+	ceph_msgr_flush();
+
+	ceph_monc_stop(&client->monc);
+
+	ceph_debugfs_client_cleanup(client);
+
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+
+	ceph_destroy_options(client->options);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have both the mon map and the osd map, each with a
+ * nonzero epoch (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * Join the ceph cluster: create the messenger if the client does not
+ * have one yet, open the monitor session, and wait until both the mon
+ * map and the osd map have arrived.
+ *
+ * @started is the jiffies value at which the caller began; the
+ * mount_timeout option is measured from it (0 means no overall limit).
+ *
+ * Returns 0 on success.  Otherwise returns the error from messenger
+ * creation or from ceph_monc_open_session(), -EIO on timeout,
+ * -EINTR/-ERESTARTSYS if the wait was interrupted, or client->auth_err
+ * if authentication failed.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->options->mount_timeout * HZ;
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->options->my_addr;
+		client->msgr = ceph_messenger_create(myaddr,
+					client->supported_features,
+					client->required_features);
+		if (IS_ERR(client->msgr)) {
+			/* fetch the errno before overwriting the ERR_PTR */
+			err = PTR_ERR(client->msgr);
+			client->msgr = NULL;
+			return err;
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* open session, and wait for mon and osd maps */
+	err = ceph_monc_open_session(&client->monc);
+	if (err < 0)
+		return err;
+
+	while (!have_mon_and_osd_map(client)) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			return err;
+
+		/* wait */
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->auth_wq,
+			have_mon_and_osd_map(client) || (client->auth_err < 0),
+			timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			return err;
+		if (client->auth_err < 0)
+			return client->auth_err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+/*
+ * Locked wrapper around __ceph_open_session(): takes mount_mutex and
+ * uses the current jiffies as the start time for the timeout.
+ */
+int ceph_open_session(struct ceph_client *client)
+{
+	int ret;
+	unsigned long started = jiffies;  /* note the start time */
+
+	dout("open_session start\n");
+	mutex_lock(&client->mount_mutex);
+
+	ret = __ceph_open_session(client, started);
+
+	mutex_unlock(&client->mount_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+/*
+ * Module init: set up the top-level debugfs directory, then the
+ * messenger; undo the debugfs setup if the messenger fails.
+ */
+static int __init init_ceph_lib(void)
+{
+	int ret = 0;
+
+	ret = ceph_debugfs_init();
+	if (ret < 0)
+		goto out;
+
+	ret = ceph_msgr_init();
+	if (ret < 0)
+		goto out_debugfs;
+
+	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+	return 0;
+
+out_debugfs:
+	ceph_debugfs_cleanup();
+out:
+	return ret;
+}
+
+/*
+ * Module exit: tear down in the reverse order of init_ceph_lib().
+ */
+static void __exit exit_ceph_lib(void)
+{
+	dout("exit_ceph_lib\n");
+	ceph_msgr_exit();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil ");
+MODULE_AUTHOR("Yehuda Sadeh ");
+MODULE_AUTHOR("Patience Warnick ");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
similarity index 92%
rename from fs/ceph/ceph_fs.c
rename to net/ceph/ceph_fs.c
index 3ac6cc7c1156..a3a3a31d3c37 100644
--- a/fs/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -1,7 +1,8 @@
 /*
  * Some non-inline ceph helpers
  */
-#include "types.h"
+#include
+#include
 
 /*
  * return true if @layout appears to be valid
@@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags)
 
 	return mode;
 }
+EXPORT_SYMBOL(ceph_flags_to_mode);
 
 int ceph_caps_for_mode(int mode)
 {
@@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode)
 
 	return caps;
 }
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
similarity index 98%
rename from fs/ceph/ceph_hash.c
rename to net/ceph/ceph_hash.c
index bd570015d147..815ef8826796 100644
--- a/fs/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -1,5 +1,5 @@
 
-#include "types.h"
+#include
 
 /*
  * Robert Jenkin's hash function.
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include
+#include
+
+/* one (numeric code, printable name) pair; a NULL name ends a table */
+struct ceph_code_name {
+	int code;
+	const char *name;
+};
+
+/*
+ * Walk @tbl up to its NULL-name terminator and return the name paired
+ * with @code, or @fallback when @code is not listed.
+ */
+static const char *ceph_code_to_name(const struct ceph_code_name *tbl,
+				     int code, const char *fallback)
+{
+	const struct ceph_code_name *ent;
+
+	for (ent = tbl; ent->name; ent++) {
+		if (ent->code == code)
+			return ent->name;
+	}
+	return fallback;
+}
+
+/*
+ * Printable name of a CEPH_ENTITY_TYPE_* value; "unknown" if the type
+ * is not one of the listed ones.
+ */
+const char *ceph_entity_type_name(int type)
+{
+	static const struct ceph_code_name entity_types[] = {
+		{ CEPH_ENTITY_TYPE_MDS, "mds" },
+		{ CEPH_ENTITY_TYPE_OSD, "osd" },
+		{ CEPH_ENTITY_TYPE_MON, "mon" },
+		{ CEPH_ENTITY_TYPE_CLIENT, "client" },
+		{ CEPH_ENTITY_TYPE_AUTH, "auth" },
+		{ 0, NULL }
+	};
+
+	return ceph_code_to_name(entity_types, type, "unknown");
+}
+
+/*
+ * Printable name of a CEPH_OSD_OP_* opcode; "???" if the opcode is not
+ * one of the listed ones.
+ */
+const char *ceph_osd_op_name(int op)
+{
+	static const struct ceph_code_name osd_ops[] = {
+		{ CEPH_OSD_OP_READ, "read" },
+		{ CEPH_OSD_OP_STAT, "stat" },
+
+		{ CEPH_OSD_OP_MASKTRUNC, "masktrunc" },
+
+		{ CEPH_OSD_OP_WRITE, "write" },
+		{ CEPH_OSD_OP_DELETE, "delete" },
+		{ CEPH_OSD_OP_TRUNCATE, "truncate" },
+		{ CEPH_OSD_OP_ZERO, "zero" },
+		{ CEPH_OSD_OP_WRITEFULL, "writefull" },
+		{ CEPH_OSD_OP_ROLLBACK, "rollback" },
+
+		{ CEPH_OSD_OP_APPEND, "append" },
+		{ CEPH_OSD_OP_STARTSYNC, "startsync" },
+		{ CEPH_OSD_OP_SETTRUNC, "settrunc" },
+		{ CEPH_OSD_OP_TRIMTRUNC, "trimtrunc" },
+
+		{ CEPH_OSD_OP_TMAPUP, "tmapup" },
+		{ CEPH_OSD_OP_TMAPGET, "tmapget" },
+		{ CEPH_OSD_OP_TMAPPUT, "tmapput" },
+
+		{ CEPH_OSD_OP_GETXATTR, "getxattr" },
+		{ CEPH_OSD_OP_GETXATTRS, "getxattrs" },
+		{ CEPH_OSD_OP_SETXATTR, "setxattr" },
+		{ CEPH_OSD_OP_SETXATTRS, "setxattrs" },
+		{ CEPH_OSD_OP_RESETXATTRS, "resetxattrs" },
+		{ CEPH_OSD_OP_RMXATTR, "rmxattr" },
+		{ CEPH_OSD_OP_CMPXATTR, "cmpxattr" },
+
+		{ CEPH_OSD_OP_PULL, "pull" },
+		{ CEPH_OSD_OP_PUSH, "push" },
+		{ CEPH_OSD_OP_BALANCEREADS, "balance-reads" },
+		{ CEPH_OSD_OP_UNBALANCEREADS, "unbalance-reads" },
+		{ CEPH_OSD_OP_SCRUB, "scrub" },
+
+		{ CEPH_OSD_OP_WRLOCK, "wrlock" },
+		{ CEPH_OSD_OP_WRUNLOCK, "wrunlock" },
+		{ CEPH_OSD_OP_RDLOCK, "rdlock" },
+		{ CEPH_OSD_OP_RDUNLOCK, "rdunlock" },
+		{ CEPH_OSD_OP_UPLOCK, "uplock" },
+		{ CEPH_OSD_OP_DNLOCK, "dnlock" },
+
+		{ CEPH_OSD_OP_CALL, "call" },
+
+		{ CEPH_OSD_OP_PGLS, "pgls" },
+		{ 0, NULL }
+	};
+
+	return ceph_code_to_name(osd_ops, op, "???");
+}
+
+
+/*
+ * Printable name of a POOL_OP_* value; "???" if the op is not one of
+ * the listed ones.
+ */
+const char *ceph_pool_op_name(int op)
+{
+	static const struct ceph_code_name pool_ops[] = {
+		{ POOL_OP_CREATE, "create" },
+		{ POOL_OP_DELETE, "delete" },
+		{ POOL_OP_AUID_CHANGE, "auid change" },
+		{ POOL_OP_CREATE_SNAP, "create snap" },
+		{ POOL_OP_DELETE_SNAP, "delete snap" },
+		{ POOL_OP_CREATE_UNMANAGED_SNAP, "create unmanaged snap" },
+		{ POOL_OP_DELETE_UNMANAGED_SNAP, "delete unmanaged snap" },
+		{ 0, NULL }
+	};
+
+	return ceph_code_to_name(pool_ops, op, "???");
+}
diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c
similarity index 99%
rename from fs/ceph/crush/crush.c
rename to net/ceph/crush/crush.c
index fabd302e5779..d6ebb13a18a4 100644
--- a/fs/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -8,7 +8,7 @@
 # define BUG_ON(x) assert(!(x))
 #endif
 
-#include "crush.h"
+#include
 
 const char *crush_bucket_alg_name(int alg)
 {
diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c
similarity index 99%
rename from fs/ceph/crush/hash.c
rename to net/ceph/crush/hash.c
index 5873aed694bf..5bb63e37a8a1 100644
--- a/fs/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,6 @@
 
 #include
-#include "hash.h"
+#include
 
 /*
  * Robert Jenkins' function for mixing 32-bit values
diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
similarity index 99%
rename from fs/ceph/crush/mapper.c
rename to net/ceph/crush/mapper.c
index a4eec133258e..42599e31dcad 100644
--- a/fs/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -18,8 +18,8 @@
 # define kfree(x) free(x)
 #endif
 
-#include "crush.h"
-#include "hash.h"
+#include
+#include
 
 /*
  * Implement the core CRUSH mapping algorithm.
diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c
similarity index 99%
rename from fs/ceph/crypto.c
rename to net/ceph/crypto.c
index a3e627f63293..7b505b0c983f 100644
--- a/fs/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -1,13 +1,13 @@
 
-#include "ceph_debug.h"
+#include
 
 #include
 #include
 #include
 #include
 
+#include
 #include "crypto.h"
-#include "decode.h"
 
 int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
 {
diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h
similarity index 95%
rename from fs/ceph/crypto.h
rename to net/ceph/crypto.h
index bdf38607323c..f9eccace592b 100644
--- a/fs/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -1,8 +1,8 @@
 #ifndef _FS_CEPH_CRYPTO_H
 #define _FS_CEPH_CRYPTO_H
 
-#include "types.h"
-#include "buffer.h"
+#include
+#include
 
 /*
  * cryptographic secret
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..27d4ea315d12
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ *
+ * NOTE(review): the code in this file creates only osdmap, monmap, osdc
+ * and monc; dentry_lru, caps and bdi are not created here - presumably
+ * they belong to the filesystem client.  Confirm and trim this list.
+ */
+
+/* top-level "ceph" debugfs directory, parent of every client directory */
+static struct dentry *ceph_debugfs_dir;
+
+/*
+ * seq_file show callback for "monmap": prints the monmap epoch followed
+ * by one line per monitor (entity name and address).  Prints nothing
+ * when the client has no monmap yet.  s->private is the ceph_client.
+ */
+static int monmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->monc.monmap == NULL)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+	for (i = 0; i < client->monc.monmap->num_mon; i++) {
+		struct ceph_entity_inst *inst =
+			&client->monc.monmap->mon_inst[i];
+
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   ceph_pr_addr(&inst->addr.in_addr));
+	}
+	return 0;
+}
+
+/*
+ * seq_file show callback for "osdmap": prints the osdmap epoch, the
+ * NEARFULL/FULL flags, one line per pg pool, and one line per osd slot
+ * (address, weight as a percentage, state string).  Prints nothing when
+ * the client has no osdmap yet.  s->private is the ceph_client.
+ */
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+	struct rb_node *n;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	/* pools are kept in an rbtree; walk it in order */
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pool =
+			rb_entry(n, struct ceph_pg_pool_info, node);
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];
+
+		/*
+		 * weight*100 >> 16: presumably osd_weight is 16.16 fixed
+		 * point with 0x10000 meaning 100% - TODO confirm.
+		 */
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, ceph_pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+/*
+ * seq_file show callback for "monc": under monc->mutex, prints which
+ * maps the mon client has, whether it wants the next osdmap, and one
+ * line per outstanding generic request (tid plus "statfs" or
+ * "unknown").  s->private is the ceph_client.
+ */
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		/* the message type is stored little-endian in the header */
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * seq_file show callback for "osdc": under osdc->request_mutex, prints
+ * one line per in-flight osd request with its tid, target osd (-1 if
+ * none is assigned), pg id, object name, reassert version (if any) and
+ * the names of its ops.  s->private is the ceph_client.
+ */
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
+
+		/* the request head sits at the start of the front buffer */
+		head = req->r_request->front.iov_base;
+		/* the op array is read from just past the head */
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		/* the object name is read from just past the last op */
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+/*
+ * NOTE(review): presumably each of these expands to the open routine
+ * and the <name>_fops structure referenced by debugfs_create_file()
+ * below - confirm against the macro definition.
+ */
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+/*
+ * Create the top-level "ceph" debugfs directory.
+ * Returns 0 on success, -ENOMEM if the directory could not be created.
+ */
+int ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	if (!ceph_debugfs_dir)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Remove the top-level "ceph" debugfs directory.
+ */
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+/*
+ * Create the per-client debugfs directory, named
+ * "<fsid>.client<global_id>", and its monc, osdc, monmap and osdmap
+ * files.
+ *
+ * Returns 0 on success.  If any entry cannot be created, everything
+ * created so far is removed again and -ENOMEM is returned.
+ */
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = -ENOMEM;
+	char name[80];
+
+	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+		 client->monc.auth->global_id);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	return 0;
+
+out:
+	/* removes whatever subset of the entries was created above */
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+/*
+ * Remove the per-client debugfs files and then their directory.
+ */
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_osdmap);
+	debugfs_remove(client->debugfs_monmap);
+	debugfs_remove(client->osdc.debugfs_file);
+	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_dir);
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+/* without debugfs, every entry point below is a no-op that succeeds */
+
+int ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c
similarity index 89%
rename from fs/ceph/messenger.c
rename to net/ceph/messenger.c
index 2502d76fcec1..0e8157ee5d43 100644
--- a/fs/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include
 
 #include
 #include
@@ -9,12 +9,14 @@
 #include
 #include
 #include
+#include
+#include
 #include
 
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
+#include
+#include
+#include
+#include
 
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
 static
DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; -const char *pr_addr(const struct sockaddr_storage *ss) +const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; char *s; @@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss) return s; } +EXPORT_SYMBOL(ceph_pr_addr); static void encode_my_addr(struct ceph_messenger *msgr) { @@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ struct workqueue_struct *ceph_msgr_wq; -int __init ceph_msgr_init(void) +int ceph_msgr_init(void) { ceph_msgr_wq = create_workqueue("ceph-msgr"); if (IS_ERR(ceph_msgr_wq)) { @@ -102,16 +105,19 @@ int __init ceph_msgr_init(void) } return 0; } +EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { destroy_workqueue(ceph_msgr_wq); } +EXPORT_SYMBOL(ceph_msgr_exit); void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } +EXPORT_SYMBOL(ceph_msgr_flush); /* @@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); + dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), sock->sk->sk_state); ret = 0; } if (ret < 0) { pr_err("connect %s error %d\n", - pr_addr(&con->peer_addr.in_addr), ret); + ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); con->sock = NULL; con->error_msg = "connect error"; @@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { - dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); + dout("con_close %p peer %s\n", con, + ceph_pr_addr(&con->peer_addr.in_addr)); set_bit(CLOSED, &con->state); /* in case there's queued work */ clear_bit(STANDBY, 
&con->state); /* avoid connect_seq bump */ clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ @@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con) mutex_unlock(&con->mutex); queue_con(con); } +EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) { - dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); + dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); set_bit(OPENING, &con->state); clear_bit(CLOSED, &con->state); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ queue_con(con); } +EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened @@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); } +EXPORT_SYMBOL(ceph_con_init); /* @@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con) if (le32_to_cpu(m->hdr.data_len) > 0) { /* initialize page iterator */ con->out_msg_pos.page = 0; - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + if (m->pages) + con->out_msg_pos.page_pos = + le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + else + con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; con->out_msg_pos.did_page_crc = 0; con->out_more = 1; /* data + footer will follow */ @@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); + con->out_connect.features = cpu_to_le64(msgr->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ 
-712,6 +725,31 @@ out: return ret; /* done! */ } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con) size_t len; int crc = con->msgr->nocrc; int ret; + int total_max_write; + int in_trail = 0; + size_t trail_len = (msg->trail ? msg->trail->length : 0); dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, con->out_msg_pos.page_pos); - while (con->out_msg_pos.page < con->out_msg->nr_pages) { +#ifdef CONFIG_BLOCK + if (msg->bio && !msg->bio_iter) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + + while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; void *kaddr = NULL; + int max_write = PAGE_SIZE; + int page_shift = 0; + + total_max_write = data_len - trail_len - + con->out_msg_pos.data_pos; /* * if we are calculating the data crc (the default), we need * to map the page. if our pages[] has been revoked, use the * zero page. */ - if (msg->pages) { + + /* have we reached the trail part of the data? 
*/ + if (con->out_msg_pos.data_pos >= data_len - trail_len) { + in_trail = 1; + + total_max_write = data_len - con->out_msg_pos.data_pos; + + page = list_first_entry(&msg->trail->head, + struct page, lru); + if (crc) + kaddr = kmap(page); + max_write = PAGE_SIZE; + } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; if (crc) kaddr = kmap(page); @@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct page, lru); if (crc) kaddr = kmap(page); +#ifdef CONFIG_BLOCK + } else if (msg->bio) { + struct bio_vec *bv; + + bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + page = bv->bv_page; + page_shift = bv->bv_offset; + if (crc) + kaddr = kmap(page) + page_shift; + max_write = bv->bv_len; +#endif } else { page = con->msgr->zero_page; if (crc) kaddr = page_address(con->msgr->zero_page); } - len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), - (int)(data_len - con->out_msg_pos.data_pos)); + len = min_t(int, max_write - con->out_msg_pos.page_pos, + total_max_write); + if (crc && !con->out_msg_pos.did_page_crc) { void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); @@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) cpu_to_le32(crc32c(tmpcrc, base, len)); con->out_msg_pos.did_page_crc = 1; } - ret = kernel_sendpage(con->sock, page, - con->out_msg_pos.page_pos, len, + con->out_msg_pos.page_pos + page_shift, + len, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (crc && (msg->pages || msg->pagelist)) + if (crc && + (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); if (ret <= 0) @@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg_pos.page_pos = 0; con->out_msg_pos.page++; con->out_msg_pos.did_page_crc = 0; - if (msg->pagelist) + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) list_move_tail(&page->lru, &msg->pagelist->head); +#ifdef 
CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); +#endif } } @@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", - pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr.in_addr)); con->error_msg = "protocol error, bad banner"; return -1; } @@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end, addr_set_port(ss, port); - dout("parse_ips got %s\n", pr_addr(ss)); + dout("parse_ips got %s\n", ceph_pr_addr(ss)); if (p == end) break; @@ -1061,6 +1144,7 @@ bad: pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } +EXPORT_SYMBOL(ceph_parse_ips); static int process_banner(struct ceph_connection *con) { @@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con) !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warning("wrong peer, want %s/%d, got %s/%d\n", - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), (int)le32_to_cpu(con->peer_addr.nonce), - pr_addr(&con->actual_peer_addr.in_addr), + ceph_pr_addr(&con->actual_peer_addr.in_addr), (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; @@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con) addr_set_port(&con->msgr->inst.addr.in_addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", - pr_addr(&con->msgr->inst.addr.in_addr)); + ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } set_bit(NEGOTIATING, &con->state); @@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = con->msgr->supported_features; + u64 req_feat = con->msgr->required_features; 
u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; fail_protocol(con); @@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; @@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); @@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol feature mismatch," " my required %llx > server's %llx, need %llx\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; fail_protocol(con); @@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) { - int left; - int ret; + int ret, left; BUG_ON(!section); @@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con, static struct ceph_msg 
*ceph_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); + + +static int read_partial_message_pages(struct ceph_connection *con, + struct page **pages, + unsigned data_len, int datacrc) +{ + void *p; + int ret; + int left; + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + /* (page) data */ + BUG_ON(pages == NULL); + p = kmap(pages[con->in_msg_pos.page]); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(pages[con->in_msg_pos.page]); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == PAGE_SIZE) { + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; + } + + return ret; +} + +#ifdef CONFIG_BLOCK +static int read_partial_message_bio(struct ceph_connection *con, + struct bio **bio_iter, int *bio_seg, + unsigned data_len, int datacrc) +{ + struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); + void *p; + int ret, left; + + if (IS_ERR(bv)) + return PTR_ERR(bv); + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(bv->bv_len - con->in_msg_pos.page_pos)); + + p = kmap(bv->bv_page) + bv->bv_offset; + + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(bv->bv_page); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == bv->bv_len) { + con->in_msg_pos.page_pos = 0; + iter_bio_next(bio_iter, bio_seg); + } + + return ret; +} +#endif + /* * read (part of) a message. 
*/ static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; - void *p; int ret; int to, left; unsigned front_len, middle_len, data_len, data_off; @@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con) if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld, expected %lld\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); @@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con) m->middle->vec.iov_len = 0; con->in_msg_pos.page = 0; - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + if (m->pages) + con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + else + con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; } @@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } +#ifdef CONFIG_BLOCK + if (m->bio && !m->bio_iter) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - BUG_ON(m->pages == NULL); - p = kmap(m->pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(m->pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; + if (m->pages) { + ret = read_partial_message_pages(con, m->pages, + data_len, datacrc); + if (ret <= 0) + return ret; +#ifdef CONFIG_BLOCK + } else if (m->bio) { + + ret = read_partial_message_bio(con, 
+ &m->bio_iter, &m->bio_seg, + data_len, datacrc); + if (ret <= 0) + return ret; +#endif + } else { + BUG_ON(1); } } @@ -1874,9 +2032,9 @@ out: static void ceph_fault(struct ceph_connection *con) { pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), con->error_msg); + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", - con, con->state, pr_addr(&con->peer_addr.in_addr)); + con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); if (test_bit(LOSSYTX, &con->state)) { dout("fault on LOSSYTX channel\n"); @@ -1936,7 +2094,9 @@ out: /* * create a new messenger instance */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) +struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features) { struct ceph_messenger *msgr; @@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) if (msgr == NULL) return ERR_PTR(-ENOMEM); + msgr->supported_features = supported_features; + msgr->required_features = required_features; + spin_lock_init(&msgr->global_seq_lock); /* the zero page is needed if a request is "canceled" while the message @@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) dout("messenger_create %p\n", msgr); return msgr; } +EXPORT_SYMBOL(ceph_messenger_create); void ceph_messenger_destroy(struct ceph_messenger *msgr) { @@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) kfree(msgr); dout("destroyed messenger %p\n", msgr); } +EXPORT_SYMBOL(ceph_messenger_destroy); /* * Queue up an outgoing message on the given connection. 
@@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } +EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send @@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con) test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } +EXPORT_SYMBOL(ceph_con_keepalive); /* @@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) m->nr_pages = 0; m->pages = NULL; m->pagelist = NULL; + m->bio = NULL; + m->bio_iter = NULL; + m->bio_seg = 0; + m->trail = NULL; dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -2146,6 +2317,7 @@ out: pr_err("msg_new can't create type %d front %d\n", type, front_len); return NULL; } +EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't @@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref) m->pagelist = NULL; } + m->trail = NULL; + if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_kfree(m); } +EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { @@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg) DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } +EXPORT_SYMBOL(ceph_msg_dump); diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c similarity index 94% rename from fs/ceph/mon_client.c rename to net/ceph/mon_client.c index b2a5a3e4a671..8a079399174a 100644 --- a/fs/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1,14 +1,16 @@ -#include "ceph_debug.h" +#include +#include #include #include #include #include -#include "mon_client.h" -#include "super.h" -#include "auth.h" -#include "decode.h" +#include +#include +#include + +#include /* * Interact with Ceph monitor cluster. 
Handle requests for new map @@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) m->num_mon); for (i = 0; i < m->num_mon; i++) dout("monmap_decode mon%d is %s\n", i, - pr_addr(&m->mon_inst[i].addr.in_addr)); + ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); return m; bad: @@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc) struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; + int num; p = msg->front.iov_base; end = p + msg->front_max; - dout("__send_subscribe to 'mdsmap' %u+\n", - (unsigned)monc->have_mdsmap); + num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; + ceph_encode_32(&p, num); + if (monc->want_next_osdmap) { dout("__send_subscribe to 'osdmap' %u\n", (unsigned)monc->have_osdmap); - ceph_encode_32(&p, 3); ceph_encode_string(&p, end, "osdmap", 6); i = p; i->have = cpu_to_le64(monc->have_osdmap); i->onetime = 1; p += sizeof(*i); monc->want_next_osdmap = 2; /* requested */ - } else { - ceph_encode_32(&p, 2); } - ceph_encode_string(&p, end, "mdsmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_mdsmap); - i->onetime = 0; - p += sizeof(*i); + if (monc->want_mdsmap) { + dout("__send_subscribe to 'mdsmap' %u+\n", + (unsigned)monc->have_mdsmap); + ceph_encode_string(&p, end, "mdsmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_mdsmap); + i->onetime = 0; + p += sizeof(*i); + } ceph_encode_string(&p, end, "monmap", 6); i = p; i->have = 0; @@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); if (monc->hunting) { pr_info("mon%d %s session established\n", - monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); + monc->cur_mon, + ceph_pr_addr(&monc->con->peer_addr.in_addr)); monc->hunting = false; } dout("handle_subscribe_ack after %d seconds\n", seconds); @@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) mutex_unlock(&monc->mutex); return 0; } 
+EXPORT_SYMBOL(ceph_monc_got_mdsmap); int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) { @@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); return 0; } +EXPORT_SYMBOL(ceph_monc_open_session); /* * The monitor responds with mount ack indicate mount success. The @@ -540,6 +548,7 @@ out: kref_put(&req->kref, release_generic_request); return err; } +EXPORT_SYMBOL(ceph_monc_do_statfs); /* * pool ops @@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc, pool, 0, (char *)snapid, sizeof(*snapid)); } +EXPORT_SYMBOL(ceph_monc_create_snapid); int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) @@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { - struct ceph_mount_args *args = monc->client->mount_args; - struct ceph_entity_addr *mon_addr = args->mon_addr; - int num_mon = args->num_mon; + struct ceph_options *opt = monc->client->options; + struct ceph_entity_addr *mon_addr = opt->mon_addr; + int num_mon = opt->num_mon; int i; /* build initial monmap */ @@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) } monc->monmap->num_mon = num_mon; monc->have_fsid = false; - - /* release addr memory */ - kfree(args->mon_addr); - args->mon_addr = NULL; - args->num_mon = 0; return 0; } @@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->con = NULL; /* authentication */ - monc->auth = ceph_auth_init(cl->mount_args->name, - cl->mount_args->secret); + monc->auth = ceph_auth_init(cl->options->name, + cl->options->secret); if (IS_ERR(monc->auth)) return PTR_ERR(monc->auth); monc->auth->want_keys = @@ -808,6 +813,7 @@ out_monmap: out: return err; } +EXPORT_SYMBOL(ceph_monc_init); void ceph_monc_stop(struct ceph_mon_client *monc) { @@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc) 
kfree(monc->monmap); } +EXPORT_SYMBOL(ceph_monc_stop); static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) @@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); return ret; } +EXPORT_SYMBOL(ceph_monc_validate_auth); /* * handle incoming message @@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_monc_handle_map(monc, msg); break; - case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(&monc->client->mdsc, msg); - break; - case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(&monc->client->osdc, msg); break; default: + /* can the chained handler handle it? */ + if (monc->client->extra_mon_dispatch && + monc->client->extra_mon_dispatch(monc->client, msg) == 0) + break; + pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } @@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con) if (monc->con && !monc->hunting) pr_info("mon%d %s session lost, " "hunting for new mon\n", monc->cur_mon, - pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con->peer_addr.in_addr)); __close_session(monc); if (!monc->hunting) { diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c similarity index 95% rename from fs/ceph/msgpool.c rename to net/ceph/msgpool.c index dd65a6438131..d5f2d97ac05c 100644 --- a/fs/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -1,11 +1,11 @@ -#include "ceph_debug.h" +#include #include #include #include #include -#include "msgpool.h" +#include static void *alloc_fn(gfp_t gfp_mask, void *arg) { diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c similarity index 84% rename from fs/ceph/osd_client.c rename to net/ceph/osd_client.c index 3b5571b8ce22..79391994b3ed 100644 --- a/fs/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,17 +1,22 @@ -#include "ceph_debug.h" +#include +#include #include #include #include #include #include #include +#ifdef CONFIG_BLOCK +#include +#endif -#include "super.h" 
-#include "osd_client.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" +#include +#include +#include +#include +#include +#include #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 @@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc, static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static int op_needs_trail(int op) +{ + switch (op) { + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_CALL: + return 1; + default: + return 0; + } +} + +static int op_has_extent(int op) +{ + return (op == CEPH_OSD_OP_READ || + op == CEPH_OSD_OP_WRITE); +} + +void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + + reqhead->snapid = cpu_to_le64(snapid); + + /* object extent? */ + ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (*plen < orig_len) + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + + if (op_has_extent(op->op)) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", + *bno, objoff, objlen, req->r_num_pages); + +} +EXPORT_SYMBOL(ceph_calc_raw_layout); + /* * Implement client access to distributed object storage cluster. * @@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); * fill osd op in request message. 
*/ static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, struct ceph_file_layout *layout, + struct ceph_vino vino, + struct ceph_file_layout *layout, u64 off, u64 *plen, - struct ceph_osd_request *req) + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; - struct ceph_osd_op *op = (void *)(reqhead + 1); - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ u64 bno; - reqhead->snapid = cpu_to_le64(vino.snap); - - /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, &bno, - &objoff, &objlen); - if (*plen < orig_len) - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); + ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - - op->extent.offset = cpu_to_le64(objoff); - op->extent.length = cpu_to_le64(objlen); - req->r_num_pages = calc_pages_for(off, *plen); - - dout("calc_layout %s (%d) %llu~%llu (%d pages)\n", - req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); } /* @@ -101,12 +144,259 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_own_pages) ceph_release_page_vector(req->r_pages, req->r_num_pages); +#ifdef CONFIG_BLOCK + if (req->r_bio) + bio_put(req->r_bio); +#endif ceph_put_snap_context(req->r_snapc); + if (req->r_trail) { + ceph_pagelist_release(req->r_trail); + kfree(req->r_trail); + } if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else kfree(req); } +EXPORT_SYMBOL(ceph_osdc_release_request); + +static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) +{ + int i = 0; + + if (needs_trail) + *needs_trail = 0; + while (ops[i].op) { + if (needs_trail && op_needs_trail(ops[i].op)) + *needs_trail = 1; + i++; + } + + return i; +} + +struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int 
flags, + struct ceph_snap_context *snapc, + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio) +{ + struct ceph_osd_request *req; + struct ceph_msg *msg; + int needs_trail; + int num_op = get_num_ops(ops, &needs_trail); + size_t msg_size = sizeof(struct ceph_osd_request_head); + + msg_size += num_op*sizeof(struct ceph_osd_op); + + if (use_mempool) + req = mempool_alloc(osdc->req_mempool, gfp_flags); + else + req = kmalloc(sizeof(*req), gfp_flags); + if (req == NULL) + return NULL; + /* zero req only after the NULL check; allocation can fail */ + memset(req, 0, sizeof(*req)); + + req->r_osdc = osdc; + req->r_mempool = use_mempool; + + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + req->r_flags = flags; + + WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); + + /* create reply message */ + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, + OSD_OPREPLY_FRONT_LEN, gfp_flags); + if (!msg) { + ceph_osdc_put_request(req); + return NULL; + } + req->r_reply = msg; + + /* allocate space for the trailing data */ + if (needs_trail) { + req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); + if (!req->r_trail) { + ceph_osdc_put_request(req); + return NULL; + } + ceph_pagelist_init(req->r_trail); + } + /* create request message; allow space for oid */ + msg_size += 40; + if (snapc) + msg_size += sizeof(u64) * snapc->num_snaps; + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); + if (!msg) { + ceph_osdc_put_request(req); + return NULL; + } + + msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); + memset(msg->front.iov_base, 0, msg->front.iov_len); + + req->r_request = msg; + req->r_pages = pages; +#ifdef CONFIG_BLOCK + if (bio) { + req->r_bio = bio; + bio_get(req->r_bio); + } +#endif + + return req; +}
+EXPORT_SYMBOL(ceph_osdc_alloc_request); + +static void osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, + struct ceph_osd_req_op *src) +{ + dst->op = cpu_to_le16(src->op); + + switch (src->op) { /* host-order opcode; dst->op is already le16 */ + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + dst->extent.offset = + cpu_to_le64(src->extent.offset); + dst->extent.length = + cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + break; + + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + BUG_ON(!req->r_trail); + + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); + dst->xattr.cmp_op = src->xattr.cmp_op; + dst->xattr.cmp_mode = src->xattr.cmp_mode; + ceph_pagelist_append(req->r_trail, src->xattr.name, + src->xattr.name_len); + ceph_pagelist_append(req->r_trail, src->xattr.val, + src->xattr.value_len); + break; + case CEPH_OSD_OP_CALL: + BUG_ON(!req->r_trail); + + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + + ceph_pagelist_append(req->r_trail, src->cls.class_name, + src->cls.class_len); + ceph_pagelist_append(req->r_trail, src->cls.method_name, + src->cls.method_len); + ceph_pagelist_append(req->r_trail, src->cls.indata, + src->cls.indata_len); + break; + case CEPH_OSD_OP_ROLLBACK: + dst->snap.snapid = cpu_to_le64(src->snap.snapid); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + default: + pr_err("unrecognized osd opcode %d\n", src->op); + WARN_ON(1); + break; + } + dst->payload_len = cpu_to_le32(src->payload_len); +} + +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, +
int oid_len) +{ + struct ceph_msg *msg = req->r_request; + struct ceph_osd_request_head *head; + struct ceph_osd_req_op *src_op; + struct ceph_osd_op *op; + void *p; + int num_op = get_num_ops(src_ops, NULL); + size_t msg_size = sizeof(*head) + num_op*sizeof(*op); + int flags = req->r_flags; + u64 data_len = 0; + int i; + + head = msg->front.iov_base; + op = (void *)(head + 1); + p = (void *)(op + num_op); + + req->r_snapc = ceph_get_snap_context(snapc); + + head->client_inc = cpu_to_le32(1); /* always, for now. */ + head->flags = cpu_to_le32(flags); + if (flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(&head->mtime, mtime); + head->num_ops = cpu_to_le16(num_op); + + + /* fill in oid */ + head->object_len = cpu_to_le32(oid_len); + memcpy(p, oid, oid_len); + p += oid_len; + + src_op = src_ops; + while (src_op->op) { + osd_req_encode_op(req, op, src_op); + src_op++; + op++; + } + + if (req->r_trail) + data_len += req->r_trail->length; + + if (snapc) { + head->snap_seq = cpu_to_le64(snapc->seq); + head->num_snaps = cpu_to_le32(snapc->num_snaps); + for (i = 0; i < snapc->num_snaps; i++) { + put_unaligned_le64(snapc->snaps[i], p); + p += sizeof(u64); + } + } + + if (flags & CEPH_OSD_FLAG_WRITE) { + req->r_request->hdr.data_off = cpu_to_le16(off); + req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + } else if (data_len) { + req->r_request->hdr.data_off = 0; + req->r_request->hdr.data_len = cpu_to_le32(data_len); + } + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + return; +} +EXPORT_SYMBOL(ceph_osdc_build_request); /* * build new request AND message, calculate layout, and adjust file @@ -131,110 +421,40 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct timespec *mtime, bool use_mempool, int num_reply) { + struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; - struct ceph_msg *msg; - 
struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - void *p; - int num_op = 1 + do_sync; - size_t msg_size = sizeof(*head) + num_op*sizeof(*op); - int i; - if (use_mempool) { - req = mempool_alloc(osdc->req_mempool, GFP_NOFS); - memset(req, 0, sizeof(*req)); - } else { - req = kzalloc(sizeof(*req), GFP_NOFS); - } - if (req == NULL) - return NULL; - - req->r_osdc = osdc; - req->r_mempool = use_mempool; - kref_init(&req->r_kref); - init_completion(&req->r_completion); - init_completion(&req->r_safe_completion); - INIT_LIST_HEAD(&req->r_unsafe_item); - req->r_flags = flags; - - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - - /* create reply message */ - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, GFP_NOFS); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - req->r_reply = msg; - - /* create request message; allow space for oid */ - msg_size += 40; - if (snapc) - msg_size += sizeof(u64) * snapc->num_snaps; - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); - memset(msg->front.iov_base, 0, msg->front.iov_len); - head = msg->front.iov_base; - op = (void *)(head + 1); - p = (void *)(op + num_op); - - req->r_request = msg; - req->r_snapc = ceph_get_snap_context(snapc); - - head->client_inc = cpu_to_le32(1); /* always, for now. 
*/ - head->flags = cpu_to_le32(flags); - if (flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(&head->mtime, mtime); - head->num_ops = cpu_to_le16(num_op); - op->op = cpu_to_le16(opcode); - - /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req); - req->r_file_layout = *layout; /* keep a copy */ - - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen); - op->payload_len = cpu_to_le32(*plen); - } - op->extent.truncate_size = cpu_to_le64(truncate_size); - op->extent.truncate_seq = cpu_to_le32(truncate_seq); - - /* fill in oid */ - head->object_len = cpu_to_le32(req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + ops[0].op = opcode; + ops[0].extent.truncate_seq = truncate_seq; + ops[0].extent.truncate_size = truncate_size; + ops[0].payload_len = 0; if (do_sync) { - op++; - op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC); - } - if (snapc) { - head->snap_seq = cpu_to_le64(snapc->seq); - head->num_snaps = cpu_to_le32(snapc->num_snaps); - for (i = 0; i < snapc->num_snaps; i++) { - put_unaligned_le64(snapc->snaps[i], p); - p += sizeof(u64); - } - } + ops[1].op = CEPH_OSD_OP_STARTSYNC; + ops[1].payload_len = 0; + ops[2].op = 0; + } else + ops[1].op = 0; + + req = ceph_osdc_alloc_request(osdc, flags, + snapc, ops, + use_mempool, + GFP_NOFS, NULL, NULL); + if (!req) /* allocation failure is reported as NULL, not ERR_PTR */ + return NULL; + + /* calculate max write size */ + calc_layout(osdc, vino, layout, off, plen, req, ops); + req->r_file_layout = *layout; /* keep a copy */ + + ceph_osdc_build_request(req, off, plen, ops, + snapc, + mtime, + req->r_oid, req->r_oid_len); - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); return req; } +EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid.
@@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc, dout("__move_osd_to_lru %p\n", osd); BUG_ON(!list_empty(&osd->o_osd_lru)); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; } static void __remove_osd_from_lru(struct ceph_osd *osd) @@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, - osdc->client->mount_args->osd_keepalive_timeout * HZ); + osdc->client->options->osd_keepalive_timeout * HZ); } static void __cancel_osd_timeout(struct ceph_osd_client *osdc) @@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_osd_request *req, *last_req = NULL; struct ceph_osd *osd; - unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; + unsigned long timeout = osdc->client->options->osd_timeout * HZ; unsigned long keepalive = - osdc->client->mount_args->osd_keepalive_timeout * HZ; + osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; struct rb_node *p; struct list_head slow_osds; @@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, osds_timeout_work.work); unsigned long delay = - osdc->client->mount_args->osd_idle_ttl * HZ >> 2; + osdc->client->options->osd_idle_ttl * HZ >> 2; dout("osds timeout\n"); down_read(&osdc->map_sem); @@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, req->r_request->pages = req->r_pages; req->r_request->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + req->r_request->bio = req->r_bio; +#endif + req->r_request->trail = req->r_trail; register_request(osdc, req); @@ -1131,6 +1355,7 @@ int 
ceph_osdc_start_request(struct ceph_osd_client *osdc, up_read(&osdc->map_sem); return rc; } +EXPORT_SYMBOL(ceph_osdc_start_request); /* * wait for a request to complete @@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); return req->r_result; } +EXPORT_SYMBOL(ceph_osdc_wait_request); /* * sync - wait for all in-flight requests to flush. avoid starvation. @@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc) mutex_unlock(&osdc->request_mutex); dout("sync done (thru tid %llu)\n", last_tid); } +EXPORT_SYMBOL(ceph_osdc_sync); /* * init, shutdown @@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); + round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); err = -ENOMEM; osdc->req_mempool = mempool_create_kmalloc_pool(10, @@ -1237,6 +1464,7 @@ out_mempool: out: return err; } +EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { @@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); } +EXPORT_SYMBOL(ceph_osdc_stop); /* * Read some contiguous pages. 
If we cross a stripe boundary, shorten @@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages result %d\n", rc); return rc; } +EXPORT_SYMBOL(ceph_osdc_readpages); /* * do a synchronous write on N pages @@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, dout("writepages result %d\n", rc); return rc; } +EXPORT_SYMBOL(ceph_osdc_writepages); /* * handle incoming message @@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m->pages = req->r_pages; m->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + m->bio = req->r_bio; +#endif } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c similarity index 97% rename from fs/ceph/osdmap.c rename to net/ceph/osdmap.c index e31f118f1392..d73f3f6efa36 100644 --- a/fs/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1,14 +1,15 @@ -#include "ceph_debug.h" +#include +#include #include #include -#include "super.h" -#include "osdmap.h" -#include "crush/hash.h" -#include "crush/mapper.h" -#include "decode.h" +#include +#include +#include +#include +#include char *ceph_osdmap_state_str(char *str, int len, int state) { @@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) +{ + struct rb_node *rbp; + + for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { + struct ceph_pg_pool_info *pi = + rb_entry(rbp, struct ceph_pg_pool_info, node); + if (pi->name && strcmp(pi->name, name) == 0) + return pi->id; + } + return -ENOENT; +} +EXPORT_SYMBOL(ceph_pg_poolid_by_name); + static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { rb_erase(&pi->node, root); @@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); } 
+EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* * calculate an object layout (i.e. pgid) from an oid, @@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, ol->ol_stripe_unit = fl->fl_object_stripe_unit; return 0; } +EXPORT_SYMBOL(ceph_calc_object_layout); /* * Calculate raw osd vector for the given pgid. Return pointer to osd @@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) return osds[i]; return -1; } +EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c new file mode 100644 index 000000000000..13cb409a7bba --- /dev/null +++ b/net/ceph/pagelist.c @@ -0,0 +1,154 @@ + +#include +#include +#include +#include +#include + +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) { + struct page *page = list_entry(pl->head.prev, struct page, lru); + kunmap(page); + pl->mapped_tail = NULL; + } +} + +int ceph_pagelist_release(struct ceph_pagelist *pl) +{ + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + ceph_pagelist_free_reserve(pl); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_release); + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page; + + if (!pl->num_pages_free) { + page = __page_cache_alloc(GFP_NOFS); + } else { + page = list_first_entry(&pl->free_list, struct page, lru); + list_del(&page->lru); + --pl->num_pages_free; + } + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + ceph_pagelist_unmap_tail(pl); + list_add_tail(&page->lru, &pl->head); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= 
bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_append); + +/** + * Allocate enough pages for a pagelist to append the given amount + * of data without allocating. + * Returns: 0 on success, -ENOMEM on error. + */ +int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) +{ + if (space <= pl->room) + return 0; + space -= pl->room; + space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ + + while (space > pl->num_pages_free) { + struct page *page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + list_add_tail(&page->lru, &pl->free_list); + ++pl->num_pages_free; + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_reserve); + +/** + * Free any pages that have been preallocated. + */ +int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) +{ + while (!list_empty(&pl->free_list)) { + struct page *page = list_first_entry(&pl->free_list, + struct page, lru); + list_del(&page->lru); + __free_page(page); + --pl->num_pages_free; + } + BUG_ON(pl->num_pages_free); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_free_reserve); + +/** + * Create a truncation point. + */ +void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + c->pl = pl; + c->page_lru = pl->head.prev; + c->room = pl->room; +} +EXPORT_SYMBOL(ceph_pagelist_set_cursor); + +/** + * Truncate a pagelist to the given point. Move extra pages to reserve. + * This won't sleep.
+ * Returns: 0 on success, + * -EINVAL if the pagelist doesn't match the trunc point pagelist + */ +int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + struct page *page; + + if (pl != c->pl) + return -EINVAL; + ceph_pagelist_unmap_tail(pl); + while (pl->head.prev != c->page_lru) { + page = list_entry(pl->head.prev, struct page, lru); + list_del(&page->lru); /* remove from pagelist */ + list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + ++pl->num_pages_free; + } + pl->room = c->room; + if (!list_empty(&pl->head)) { + page = list_entry(pl->head.prev, struct page, lru); + pl->mapped_tail = kmap(page); + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_truncate); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c new file mode 100644 index 000000000000..54caf0687155 --- /dev/null +++ b/net/ceph/pagevec.c @@ -0,0 +1,223 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * build a vector of user pages + */ +struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len) +{ + struct page **pages; + int rc; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + down_read(&current->mm->mmap_sem); + rc = get_user_pages(current, current->mm, (unsigned long)data, + num_pages, 0, 0, pages, NULL); + up_read(&current->mm->mmap_sem); + if (rc < 0) + goto fail; + return pages; + +fail: + kfree(pages); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ceph_get_direct_page_vector); + +void ceph_put_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(pages); +} +EXPORT_SYMBOL(ceph_put_page_vector); + +void ceph_release_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} +EXPORT_SYMBOL(ceph_release_page_vector); + +/* + * allocate a vector of new pages + */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, flags); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = __page_cache_alloc(flags); + if (pages[i] == NULL) { + ceph_release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} +EXPORT_SYMBOL(ceph_alloc_page_vector); + +/* + * copy user data into a page vector + */ +int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_CACHE_SIZE-po, left); + bad = copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + po += l - bad; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_user_to_page_vector); + +int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(page_address(pages[i]) + po, data, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_to_page_vector); + +int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(data, page_address(pages[i]) + po, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_from_page_vector); + +/* + * copy user data from a page vector into a user pointer + */ +int 
ceph_copy_page_vector_to_user(struct page **pages, + char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, left, PAGE_CACHE_SIZE-po); + bad = copy_to_user(data, page_address(pages[i]) + po, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + i++; + } + return len; +} +EXPORT_SYMBOL(ceph_copy_page_vector_to_user); + +/* + * Zero an extent within a page vector. Offset is relative to the + * start of the first page. + */ +void ceph_zero_page_vector_range(int off, int len, struct page **pages) +{ + int i = off >> PAGE_CACHE_SHIFT; + + off &= ~PAGE_CACHE_MASK; + + dout("zero_page_vector_page %u~%u\n", off, len); + + /* leading partial page? */ + if (off) { + int end = min((int)PAGE_CACHE_SIZE, off + len); + dout("zeroing %d %p head from %d\n", i, pages[i], + (int)off); + zero_user_segment(pages[i], off, end); + len -= (end - off); + i++; + } + while (len >= PAGE_CACHE_SIZE) { + dout("zeroing %d %p len=%d\n", i, pages[i], len); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + i++; + } + /* trailing partial page? */ + if (len) { + dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); + zero_user_segment(pages[i], 0, len); + } +} +EXPORT_SYMBOL(ceph_zero_page_vector_range); +