diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index d2c6a5ccf0f5..b19b6a03f91c 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -158,6 +158,20 @@ Mount Options copies. Currently, it's only used in copy_file_range, which will revert to the default VFS implementation if this option is used. + recover_session= + Set auto reconnect mode in the case where the client is blacklisted. The + available modes are "no" and "clean". The default is "no". + + * no: never attempt to reconnect when client detects that it has been + blacklisted. Operations will generally fail after being blacklisted. + + * clean: client reconnects to the ceph cluster automatically when it + detects that it has been blacklisted. During reconnect, client drops + dirty data/metadata, invalidates page caches and writable file handles. + After reconnect, file locks become stale because the MDS loses track + of them. If an inode contains any stale file locks, read/write on the + inode is not allowed until applications release all stale file locks. + More Information ================ diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c8fb886aebd4..7c4350c0fb77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create( mutex_init(&img_request->state_mutex); kref_init(&img_request->kref); - dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, - obj_op_name(op_type), img_request); return img_request; } @@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) __set_bit(IMG_REQ_CHILD, &child_img_req->flags); child_img_req->obj_request = obj_req; + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, + obj_req); + if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: @@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work) img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, + img_request, obj_op_name(op_type), offset, length); + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) result = rbd_img_fill_nodata(img_request, offset, length); else @@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) { + size_t size; void *reply_buf; int ret; void *p; - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + /* Response will be an encoded string, which includes a length */ + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; + reply_buf = kzalloc(size, GFP_KERNEL); if (!reply_buf) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_object_prefix", - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ - size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; response = kzalloc(size, GFP_NOIO); if (!response) { @@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + response, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index a699e320393f..c1da294418d1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o quota.o \ + export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3c8b886bf64..7ab616601141 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - off, &len, + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); ceph_fscache_readpage_cancel(inode, page); + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out; } if (err < PAGE_SIZE) @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + if (rc == -EBLACKLISTED) + ceph_inode_to_client(inode)->blacklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, + true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { @@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, /* * Write a single page, but leave the page locked. * - * If we get a write error, set the page error bit, but still adjust the + * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) @@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } + if (err == -EBLACKLISTED) + fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); - SetPageError(page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { @@ -679,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return err; } -/* - * lame release_pages helper. release_pages() isn't exported to - * modules. - */ -static void ceph_release_pages(struct page **pages, int num) -{ - struct pagevec pvec; - int i; - - pagevec_init(&pvec); - for (i = 0; i < num; i++) { - if (pagevec_add(&pvec, pages[i]) == 0) - pagevec_release(&pvec); - } - pagevec_release(&pvec); -} - /* * async writeback completion handler. * @@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); + if (rc == -EBLACKLISTED) + fsc->blacklisted = true; } else { ceph_clear_error_write(ci); } @@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", inode, osd_data->length, rc >= 0 ? num_pages : 0); - ceph_release_pages(osd_data->pages, num_pages); + release_pages(osd_data->pages, num_pages); } ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); @@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (err < 0) goto out_restore; @@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) if (!prealloc_cf) return VM_FAULT_OOM; + sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { @@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); if (err < 0) goto out_free; @@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_cap_refs(ci, got); out_free: ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); @@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; - else if (err != -EPERM) + else if (err != -EPERM) { + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out_unlock; + } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { + if (err2 == -EBLACKLISTED) + fsc->blacklisted = true; err = err2; goto out_unlock; } @@ -1989,10 +1987,11 @@ out: return err; } -int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +int ceph_pool_perm_check(struct inode *inode, int need) { - s64 pool; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; + s64 pool; int ret, flags; if (ci->i_vino.snap != CEPH_NOSNAP) { @@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) return 0; } - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + if (ceph_test_mount_opt(ceph_inode_to_client(inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bc90cf6ad7ed..b2ec29eeb4c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -6,6 +6,8 @@ * Written by Milosz Tanski (milosz@adfin.com) */ +#include + #include "super.h" #include "cache.h" diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ce0f5658720a..d3b9c9d5c1bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -457,37 +457,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) return cap; } -/* - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. - */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) -{ - struct ceph_cap *cap; - int mds = -1; - struct rb_node *p; - - /* prefer mds with WR|BUFFER|EXCL caps */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - cap = rb_entry(p, struct ceph_cap, ci_node); - mds = cap->mds; - if (cap->issued & (CEPH_CAP_FILE_WR | - CEPH_CAP_FILE_BUFFER | - CEPH_CAP_FILE_EXCL)) - break; - } - return mds; -} - -int ceph_get_cap_mds(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int mds; - spin_lock(&ci->i_ceph_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&ci->i_ceph_lock); - return mds; -} - /* * Called under i_ceph_lock. */ @@ -628,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, /* * Add a capability under the given MDS session. * - * Caller should hold session snap_rwsem (read) and s_mutex. + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an @@ -645,6 +614,9 @@ void ceph_add_cap(struct inode *inode, struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; + u32 gen; + + lockdep_assert_held(&ci->i_ceph_lock); dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); @@ -656,6 +628,10 @@ void ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + spin_unlock(&session->s_gen_ttl_lock); + cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; @@ -681,7 +657,7 @@ void ceph_add_cap(struct inode *inode, list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); - if (cap->cap_gen < session->s_cap_gen) + if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* @@ -775,7 +751,7 @@ void ceph_add_cap(struct inode *inode, cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = gen; if (fmode >= 0) __ceph_get_fmode(ci, fmode); @@ -1284,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. * - * Make half-hearted attempt ot to invalidate page cache if we are - * dropping RDCACHE. Note that this will leave behind locked pages - * that we'll then need to deal with elsewhere. - * * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * @@ -1746,11 +1718,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc, * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_ceph_lock. + * Called under i_ceph_lock. Returns the flush tid. */ -static int __mark_caps_flushing(struct inode *inode, +static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, - u64 *flush_tid, u64 *oldest_flush_tid) + u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1789,8 +1761,7 @@ static int __mark_caps_flushing(struct inode *inode, list_add_tail(&cf->i_list, &ci->i_cap_flush_list); - *flush_tid = cf->tid; - return flushing; + return cf->tid; } /* @@ -2028,11 +1999,6 @@ retry_locked: } ack: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout(" skipping %p I_NOFLUSH set\n", inode); - continue; - } - if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -2080,9 +2046,9 @@ ack: } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, false, - &flush_tid, - &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); } else { flushing = 0; flush_tid = 0; @@ -2130,16 +2096,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); retry_locked: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - spin_unlock(&ci->i_ceph_lock); - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); - goto out; - } if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; int delayed; - if (!session || session != cap->session) { + if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); if (session) mutex_unlock(&session->s_mutex); @@ -2161,8 +2122,9 @@ retry_locked: goto retry_locked; } - flushing = __mark_caps_flushing(inode, session, true, - &flush_tid, &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, @@ -2261,35 +2223,45 @@ static int unsafe_request_wait(struct inode *inode) int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; - int ret; + int ret, err; int dirty; dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); - if (ret < 0) - goto out; - if (datasync) goto out; dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - ret = unsafe_request_wait(inode); + err = unsafe_request_wait(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - ret = wait_event_interruptible(ci->i_cap_wq, + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } + + if (err < 0) + ret = err; + + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { + spin_lock(&file->f_lock); + err = errseq_check_and_advance(&ci->i_meta_err, + &fi->meta_err); + spin_unlock(&file->f_lock); + if (err < 0) + ret = err; + } out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; @@ -2560,10 +2532,15 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, bool nonblock, int *got) +enum { + NON_BLOCKING = 1, + CHECK_FILELOCK = 2, +}; + +static int try_get_cap_refs(struct inode *inode, int need, int want, + loff_t endoff, int flags, int *got) { - struct inode *inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; @@ -2576,6 +2553,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, again: spin_lock(&ci->i_ceph_lock); + if ((flags & CHECK_FILELOCK) && + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { + dout("try_get_cap_refs %p error filelock\n", inode); + ret = -EIO; + goto out_unlock; + } + /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); if ((file_wanted & need) != need) { @@ -2637,7 +2621,7 @@ again: * we can not call down_read() when * task isn't in TASK_RUNNING state */ - if (nonblock) { + if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } @@ -2731,18 +2715,19 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, + (nonblock ? NON_BLOCKING : 0), got); return ret == -EAGAIN ? 0 : ret; } @@ -2751,30 +2736,40 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, ret; + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int ret, _got, flags; - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) + return -EBADF; + while (true) { if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); + check_max_size(inode, endoff); + flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; _got = 0; - ret = try_get_cap_refs(ci, need, want, endoff, - false, &_got); + ret = try_get_cap_refs(inode, need, want, endoff, + flags, &_got); if (ret == -EAGAIN) continue; if (!ret) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!(ret = try_get_cap_refs(ci, need, want, endoff, - true, &_got))) { + flags |= NON_BLOCKING; + while (!(ret = try_get_cap_refs(inode, need, want, + endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2786,10 +2781,18 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret == -EAGAIN) continue; } + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { + if (ret >= 0 && _got) + ceph_put_cap_refs(ci, _got); + return -EBADF; + } + if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); + ret = ceph_renew_caps(inode); if (ret == 0) continue; } @@ -2798,9 +2801,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { + i_size_read(inode) > 0) { struct page *page = - find_get_page(ci->vfs_inode.i_mapping, 0); + find_get_page(inode->i_mapping, 0); if (page) { if (PageUptodate(page)) { *pinned_page = page; @@ -2819,7 +2822,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, * getattr request will bring inline data into * page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 2eb88ed22993..facb387c2735 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - return 0; } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 15ff1b09cfa2..b6bfa94332c3 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int snap_handle_length = + static const int snap_handle_length = sizeof(struct ceph_nfs_snapfh) >> 2; struct ceph_nfs_snapfh *sfh = (void *)rawfh; u64 snapid = ceph_snap(inode); @@ -85,9 +85,9 @@ out: static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int handle_length = + static const int handle_length = sizeof(struct ceph_nfs_fh) >> 2; - const static int connected_handle_length = + static const int connected_handle_length = sizeof(struct ceph_nfs_confh) >> 2; int type; @@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, if (err < 0) goto out; - rinfo = &req->r_reply_info; - for (i = 0; i < rinfo->dir_nr; i++) { - rde = rinfo->dir_entries + i; - BUG_ON(!rde->inode.in); - if (ceph_snap(inode) == - le64_to_cpu(rde->inode.in->snapid)) { - memcpy(name, rde->name, rde->name_len); - name[rde->name_len] = '\0'; - err = 0; - goto out; - } - } + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } - if (rinfo->dir_end) - break; + if (rinfo->dir_end) + break; - BUG_ON(rinfo->dir_nr <= 0); - rde = rinfo->dir_entries + (rinfo->dir_nr - 1); - next_offset += rinfo->dir_nr; - last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); - if (!last_name) { - err = -ENOMEM; - goto out; - } + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } - ceph_mdsc_put_request(req); - req = NULL; + ceph_mdsc_put_request(req); + req = NULL; } err = -ENOENT; out: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 685a03cc4b77..d277f71abe0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "io.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -201,6 +202,7 @@ out: static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -211,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); if (!dfi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -222,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); if (!fi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -232,6 +234,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, fi->fmode = fmode; spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); + fi->meta_err = errseq_sample(&ci->i_meta_err); + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; } @@ -695,7 +699,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_release_page_vector(pages, num_pages); } - if (ret <= 0 || off >= i_size || !more) + if (ret < 0) { + if (ret == -EBLACKLISTED) + fsc->blacklisted = true; + break; + } + + if (off >= i_size || !more) break; } @@ -921,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; - int ret; + int ret = 0; struct timespec64 mtime = current_time(inode); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; @@ -935,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc ? snapc->seq : 0); - ret = filemap_write_and_wait_range(inode->i_mapping, - pos, pos + count - 1); - if (ret < 0) - return ret; - if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, @@ -1260,7 +1265,8 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (ret < 0) return ret; @@ -1274,12 +1280,16 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); + ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { + ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); + ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1290,7 +1300,9 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); + ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); + ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", @@ -1399,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; retry_snap: - inode_lock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_write(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1457,7 +1472,7 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got, NULL); if (err < 0) goto out; @@ -1470,7 +1485,6 @@ retry_snap: (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; - inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1487,11 +1501,14 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - else + ceph_end_io_direct(inode); + } else { written = ceph_sync_write(iocb, &data, pos, snapc); + ceph_end_io_write(inode); + } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1506,7 +1523,7 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - inode_unlock(inode); + ceph_end_io_write(inode); } if (written >= 0) { @@ -1541,9 +1558,11 @@ retry_snap: } goto out_unlocked; - out: - inode_unlock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1781,7 +1800,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1810,16 +1829,15 @@ unlock: * src_ci. Two attempts are made to obtain both caps, and an error is return if * this fails; zero is returned on success. */ -static int get_rd_wr_caps(struct ceph_inode_info *src_ci, - loff_t src_endoff, int *src_got, - struct ceph_inode_info *dst_ci, +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, loff_t dst_endoff, int *dst_got) { int ret = 0; bool retrying = false; retry_caps: - ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, dst_endoff, dst_got, NULL); if (ret < 0) return ret; @@ -1829,24 +1847,24 @@ retry_caps: * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some * retry dance instead to try to get both capabilities. */ - ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, false, src_got); if (ret <= 0) { /* Start by dropping dst_ci caps and getting src_ci caps */ - ceph_put_cap_refs(dst_ci, *dst_got); + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); if (retrying) { if (!ret) /* ceph_try_get_caps masks EAGAIN */ ret = -EAGAIN; return ret; } - ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, src_endoff, - src_got, NULL); + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ - ceph_put_cap_refs(src_ci, *src_got); + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); retrying = true; goto retry_caps; } @@ -1904,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *src_ci = ceph_inode(src_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); struct ceph_object_locator src_oloc, dst_oloc; struct ceph_object_id src_oid, dst_oid; loff_t endoff = 0, size; @@ -1913,10 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode == dst_inode) - return -EINVAL; - if (src_inode->i_sb != dst_inode->i_sb) - return -EXDEV; + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } if (ceph_snap(dst_inode) != CEPH_NOSNAP) return -EROFS; @@ -1928,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * efficient). */ - if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || @@ -1960,8 +1985,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * clients may have dirty data in their caches. And OSDs know nothing * about caps, so they can't safely do the remote object copies. */ - err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, - dst_ci, (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) { dout("get_rd_wr_caps returned %d\n", err); ret = -EOPNOTSUPP; @@ -2018,9 +2043,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out; } len -= ret; - err = get_rd_wr_caps(src_ci, (src_off + len), - &src_got, dst_ci, - (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) goto out; err = is_file_size_ok(src_inode, dst_inode, @@ -2044,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, dst_ci->i_vino.ino, dst_objnum); /* Do an object remote copy */ err = ceph_osdc_copy_from( - &ceph_inode_to_client(src_inode)->client->osdc, + &src_fsc->client->osdc, src_ci->i_vino.snap, 0, &src_oid, &src_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 18500edefc56..9f135624ae47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); + ci->i_meta_err = 0; + return &ci->vfs_inode; } @@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ inode->i_rdev = le32_to_cpu(info->rdev); - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + /* directories have fl_stripe_unit set to zero */ + if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); @@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { int __ceph_setattr(struct inode *inode, struct iattr *attr) { struct ceph_inode_info *ci = ceph_inode(inode); - const unsigned int ia_valid = attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; @@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } if (ia_valid & ATTR_MTIME) { dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, @@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } - if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { - i_size_write(inode, attr->ia_size); - inode->i_blocks = calc_inode_blocks(attr->ia_size); - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); - mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; - } - } /* these do nothing */ if (ia_valid & ATTR_CTIME) { diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..97602ea92ff4 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. + * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include + +#include +#include +#include +#include + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + inode_dio_wait(inode); + } +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +void +ceph_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ceph_inode(inode), inode); +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_end_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..fa594cd77348 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +void ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +void ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +void ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 5083e238ad15..544e9e85b120 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -32,14 +32,18 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(src->fl_file); + struct ceph_file_info *fi = dst->fl_file->private_data; + struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); + atomic_inc(&fi->num_locks); } static void ceph_fl_release_lock(struct file_lock *fl) { + struct ceph_file_info *fi = fl->fl_file->private_data; struct inode *inode = file_inode(fl->fl_file); struct ceph_inode_info *ci = ceph_inode(inode); + atomic_dec(&fi->num_locks); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); @@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, * window. Caller function will decrease the counter. */ fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + fl->fl_ops->fl_copy_lock(fl, NULL); } if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f048bd8..a8a8f84f3bbf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; - s->s_trim_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); @@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; + struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); + if (req->r_target_inode) { + /* dropping unsafe change of inode's attributes */ + ci = ceph_inode(req->r_target_inode); + errseq_set(&ci->i_meta_err, -EIO); + } + if (req->r_unsafe_dir) { + /* dropping unsafe directory operation */ + ci = ceph_inode(req->r_unsafe_dir); + errseq_set(&ci->i_meta_err, -EIO); + } __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - bool drop = false; + bool dirty_dropped = false; bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (ci->i_wrbuffer_ref > 0 && - READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) - invalidate = true; + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, @@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = true; + dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = true; + dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); + if (dirty_dropped) { + errseq_set(&ci->i_meta_err, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; @@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } - - if (drop && - ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (drop) + if (dirty_dropped) iput(inode); return 0; } @@ -1705,11 +1721,11 @@ out: */ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_session *session = arg; + int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; - if (session->s_trim_caps <= 0) + if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); @@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); - session->s_trim_caps--; + (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ @@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) - session->s_trim_caps--; + (*remaining)--; dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, count); } else { @@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, dout("trim_caps mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { - session->s_trim_caps = trim_caps; - ceph_iterate_session_caps(session, trim_caps_cb, session); + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, - trim_caps - session->s_trim_caps); - session->s_trim_caps = 0; + trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); @@ -3015,18 +3031,23 @@ bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); } -static int __decode_and_drop_session_metadata(void **p, void *end) +static int __decode_session_metadata(void **p, void *end, + bool *blacklisted) { /* map */ u32 n; + bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + if (err_str && strnstr(*p, "blacklisted", len)) + *blacklisted = true; *p += len; } return 0; @@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session, u64 seq; unsigned long features = 0; int wake = 0; + bool blacklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_and_drop_session_metadata(&p, end) < 0) + if (__decode_session_metadata(&p, end, &blacklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); + if (blacklisted) + mdsc->fsc->blacklisted = true; wake = 2; /* for good measure */ break; @@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); } +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) + return; + + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; + + if (!READ_ONCE(fsc->blacklisted)) + return; + + if (fsc->last_auto_reconnect && + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) + return; + + pr_info("auto reconnect after blacklisted\n"); + fsc->last_auto_reconnect = jiffies; + ceph_force_reconnect(fsc->sb); +} /* * delayed work -- periodically trim expired leases, renew caps with mds @@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work) pr_info("mds%d hung\n", s->s_mds); } } - if (s->s_state < CEPH_MDS_SESSION_OPEN) { + if (s->s_state == CEPH_MDS_SESSION_NEW || + s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) { /* this mds is failed or recovering, just wait */ ceph_put_mds_session(s); continue; @@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work) ceph_trim_snapid_map(mdsc); + maybe_recover_session(mdsc); + schedule_delayed(mdsc); } @@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; + + if (session->s_state == CEPH_MDS_SESSION_REJECTED) + __unregister_session(mdsc, session); + __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { @@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f7c8603484fe..5cd131b41d84 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -148,9 +148,9 @@ enum { CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, - CEPH_MDS_SESSION_CLOSING = 5, - CEPH_MDS_SESSION_RESTARTING = 6, - CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_REJECTED = 8, }; @@ -176,7 +176,7 @@ struct ceph_mds_session { spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; - int s_nr_caps, s_trim_caps; + int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 377fafc76f20..edfd643a8205 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -143,6 +143,7 @@ enum { Opt_snapdirname, Opt_mds_namespace, Opt_fscache_uniq, + Opt_recover_session, Opt_last_string, /* string args above */ Opt_dirstat, @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_mds_namespace, "mds_namespace=%s"}, + {Opt_recover_session, "recover_session=%s"}, {Opt_fscache_uniq, "fsc=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->mds_namespace) return -ENOMEM; break; + case Opt_recover_session: + if (!strncmp(argstr[0].from, "no", + argstr[0].to - argstr[0].from)) { + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + } else if (!strncmp(argstr[0].from, "clean", + argstr[0].to - argstr[0].from)) { + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + } else { + return -EINVAL; + } + break; case Opt_fscache_uniq: kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; atomic_long_set(&fsc->writeback_count, 0); @@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); - return; + fsc->filp_gen++; // invalidate open files } static int ceph_remount(struct super_block *sb, int *flags, char *data) @@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } if (ceph_sb_to_client(sb) != fsc) { - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); fsc = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", fsc); @@ -1115,7 +1133,6 @@ out_splat: goto out_final; out: - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: dout("ceph_mount fail %ld\n", PTR_ERR(res)); @@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); - ceph_mdsc_destroy(fsc); - destroy_fs_client(fsc); free_anon_bdev(dev); } @@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + ceph_umount_begin(sb); + + /* Make sure all page caches get invalidated. + * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case that we were blacklisted. This also reset + * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blacklisted = false; + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} + static int __init init_ceph(void) { int ret = init_caches(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee7de85..f98d9247f9cb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -31,6 +32,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -101,6 +103,11 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; + + unsigned long last_auto_reconnect; + bool blacklisted; + + u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; @@ -395,6 +402,8 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif + errseq_t i_meta_err; + struct inode vfs_inode; /* at end */ }; @@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ -#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ -#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ -#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ -#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ -#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ - +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ +#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ /* * Masks of ceph inode work. @@ -703,6 +711,10 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; + + errseq_t meta_err; + u32 filp_gen; + atomic_t num_locks; }; struct ceph_dir_file_info { @@ -842,7 +854,8 @@ static inline int default_congestion_kb(void) } - +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); @@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); -extern void ceph_security_invalidate_secctx(struct inode *inode); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) @@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); -extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, @@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); -extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page); -extern int ceph_try_get_caps(struct ceph_inode_info *ci, +extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ @@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); -extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 939eab7aa219..cb18ee637cb7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); if (current->journal_info && - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); @@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); bool len_only = (size == 0); u32 namelen; int err; - int i; spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, @@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) names = __copy_xattr_names(ci, names); size -= namelen; } - - - /* virtual xattr names, too */ - if (vxattrs) { - for (i = 0; vxattrs[i].name; i++) { - size_t this_len; - - if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) - continue; - if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) - continue; - - this_len = strlen(vxattrs[i].name) + 1; - namelen += this_len; - if (len_only) - continue; - - if (this_len > size) { - err = -ERANGE; - goto out; - } - - memcpy(names, vxattrs[i].name, this_len); - names += this_len; - size -= this_len; - } - } err = namelen; out: spin_unlock(&ci->i_ceph_lock); @@ -1293,42 +1266,8 @@ out: ceph_pagelist_release(pagelist); return err; } - -void ceph_security_invalidate_secctx(struct inode *inode) -{ - security_inode_invalidate_secctx(inode); -} - -static int ceph_xattr_set_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, const void *buf, - size_t buflen, int flags) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_setxattr(inode, name, buf, buflen, flags); - } - return -EOPNOTSUPP; -} - -static int ceph_xattr_get_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, void *buf, size_t buflen) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_getxattr(inode, name, buf, buflen); - } - return -EOPNOTSUPP; -} - -static const struct xattr_handler ceph_security_label_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = ceph_xattr_get_security_label, - .set = ceph_xattr_set_security_label, -}; -#endif -#endif +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { @@ -1351,9 +1290,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { #ifdef CONFIG_CEPH_FS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, -#endif -#ifdef CONFIG_CEPH_FS_SECURITY_LABEL - &ceph_security_label_handler, #endif &ceph_other_xattr_handler, NULL, diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 82156da3c650..b9dbda1c26aa 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); +extern void ceph_reset_client_addr(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 23895d178149..c4458dc6a757 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr); extern void ceph_messenger_fini(struct ceph_messenger *msgr); +extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index b4d134d3312a..dbb8a6959a73 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); enum { CEPH_SUB_MONMAP = 0, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad7fe5d10dcd..eaffbdddf89a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); +extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); @@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); #define osd_req_op_data(oreq, whch, typ, fld) \ ({ \ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4eeea4d5c3ef..2d568246803f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +/* + * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are + * compatible with (a superset of) GFP_KERNEL. This is because while the + * actual pages are allocated with the specified flags, the page table pages + * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take + * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * + * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. + */ void *ceph_kvmalloc(size_t size, gfp_t flags) { - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *ptr = kmalloc(size, flags | __GFP_NOWARN); - if (ptr) - return ptr; + void *p; + + if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { + p = kvmalloc(size, flags); + } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { + unsigned int nofs_flag = memalloc_nofs_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + } else { + unsigned int noio_flag = memalloc_noio_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_noio_restore(noio_flag); } - return __vmalloc(size, flags, PAGE_KERNEL); + return p; } - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; @@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client) } EXPORT_SYMBOL(ceph_destroy_client); +void ceph_reset_client_addr(struct ceph_client *client) +{ + ceph_messenger_reset_nonce(&client->msgr); + ceph_monc_reopen_session(&client->monc); + ceph_osdc_reopen_osds(&client->osdc); +} +EXPORT_SYMBOL(ceph_reset_client_addr); + /* * true if we have the mon map (and have thus joined the cluster) */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962f521c863e..e4cb3db2ee77 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con) } +void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) +{ + u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; + msgr->inst.addr.nonce = cpu_to_le32(nonce); + encode_my_addr(msgr); +} /* * initialize a new messenger instance diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0520bf9825aa..7256c402ebaa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) __open_session(monc); } +void ceph_monc_reopen_session(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + reopen_session(monc); + mutex_unlock(&monc->mutex); +} + static void un_backoff(struct ceph_mon_client *monc) { monc->hunt_mult /= 2; /* reduce by 50% */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 78ae6e8c953d..ba45b074a362 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; + int ret; op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); @@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; - ceph_pagelist_append(pagelist, class, size); + ret = ceph_pagelist_append(pagelist, class, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; - ceph_pagelist_append(pagelist, method, size); + ret = ceph_pagelist_append(pagelist, method, size); + if (ret) + goto err_pagelist_free; payload_len += size; osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; + int ret; BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); @@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, payload_len = strlen(name); op->xattr.name_len = payload_len; - ceph_pagelist_append(pagelist, name, payload_len); + ret = ceph_pagelist_append(pagelist, name, payload_len); + if (ret) + goto err_pagelist_free; op->xattr.value_len = size; - ceph_pagelist_append(pagelist, value, size); + ret = ceph_pagelist_append(pagelist, value, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->xattr.cmp_op = cmp_op; @@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_xattr_init); @@ -1488,7 +1505,6 @@ enum calc_target_result { static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, - struct ceph_connection *con, bool any_change) { struct ceph_pg_pool_info *pi; @@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) } EXPORT_SYMBOL(ceph_osdc_abort_requests); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) +{ + down_write(&osdc->lock); + osdc->abort_err = 0; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_clear_abort_err); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { @@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; } - calc_target(osdc, &lreq->t, NULL, false); + calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, NULL, true); + ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, - false); + ct_res = calc_target(osdc, &req->r_t, false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); if (req->r_t.epoch < osdc->osdmap->epoch) { - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE) { erase_request(need_resend, req); check_pool_dne(req); @@ -5086,6 +5109,24 @@ out_put_req: } EXPORT_SYMBOL(ceph_osdc_call); +/* + * reset all osd connections + */ +void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + down_write(&osdc->lock); + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); + if (!reopen_osd(osd)) + kick_osd_requests(osd); + } + up_write(&osdc->lock); +} + /* * init, shutdown */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map->osd_primary_affinity); - kfree(map->crush_workspace); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + kvfree(map->osd_primary_affinity); + kvfree(map->crush_workspace); kfree(map); } @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) * * The new elements are properly initialized. */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; + u32 to_copy; int i; - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); - if (!state) + dout("%s old %u new %u\n", __func__, map->max_osd, max); + if (max == map->max_osd) + return 0; + + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + if (!state || !weight || !addr) { + kvfree(state); + kvfree(weight); + kvfree(addr); return -ENOMEM; + } + + to_copy = min(map->max_osd, max); + if (map->osd_state) { + memcpy(state, map->osd_state, to_copy * sizeof(*state)); + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + } + map->osd_state = state; - - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); - if (!weight) - return -ENOMEM; map->osd_weight = weight; - - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); - if (!addr) - return -ENOMEM; map->osd_addr = addr; - for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = krealloc(map->osd_primary_affinity, - max*sizeof(*affinity), GFP_NOFS); + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + GFP_NOFS); if (!affinity) return -ENOMEM; - map->osd_primary_affinity = affinity; + memcpy(affinity, map->osd_primary_affinity, + to_copy * sizeof(*affinity)); + kvfree(map->osd_primary_affinity); + + map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = kmalloc(work_size, GFP_NOIO); + workspace = ceph_kvmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) if (map->crush) crush_destroy(map->crush); - kfree(map->crush_workspace); + kvfree(map->crush_workspace); map->crush = crush; map->crush_workspace = workspace; return 0; @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc_array(map->max_osd, - sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = ceph_kvmalloc( + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { - kfree(map->osd_primary_affinity); + kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; }