From ed98adad3d87594c55347824e85137d1829c9e70 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Jul 2010 12:15:14 -0700 Subject: [PATCH 1/8] ceph: fix message revocation A message can be on a queue (pending or sent), or out_msg (sending), or both. We were assuming that if it's not on a queue it couldn't be out_msg, but that was false in the case of lossy connections like the OSD. Fix ceph_con_revoke() to treat these cases independently. Also, fix the out_kvec_is_message check to only trigger if we are currently sending _this_ message. This fixes a GPF in tcp_sendpage, triggered by OSD restarts. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 9ad43a310a41..9692d08e2f88 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -2015,20 +2015,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p\n", con, msg); + dout("con_revoke %p msg %p - was on queue\n", con, msg); list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - } else { - dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + ceph_msg_put(msg); + msg->hdr.seq = 0; } mutex_unlock(&con->mutex); } From 22b1de06c9fe128ca3de72560c3e8c2cabf2927a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Jul 2010 15:36:49 -0700 Subject: [PATCH 2/8] ceph: fix leak of mon authorizer Fix leak of a struct ceph_buffer on umount. Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 3fe49042d8ad..6d44053ecff1 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + kfree(ac->private); ac->private = NULL; } From b0bbb0be8f7fbf6d366b359e034c78a96c4e274d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Jul 2010 14:49:38 +0200 Subject: [PATCH 3/8] ceph: add kfree() to error path We leak a "pi" on this error path. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 50ce64ebd330..277f8b339577 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + kfree(pi); goto bad; } __decode_pool(p, pi); From d06dbaf6c2c7187938f3f6745d9e4938a2d0ec47 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Jul 2010 10:47:16 -0700 Subject: [PATCH 4/8] ceph: fix printing of ipv6 addrs The buffer was too small. Make it bigger, use snprintf(), put brackets around the ipv6 address to avoid mixing it up with the :port, and use the ever-so-handy %pI[46] formats. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 9692d08e2f88..e8c5a2d0e88f 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con); * nicely render a sockaddr as a string. */ #define MAX_ADDR_STR 20 -static char addr_str[MAX_ADDR_STR][40]; +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; @@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss) int i; char *s; struct sockaddr_in *in4 = (void *)ss; - unsigned char *quad = (void *)&in4->sin_addr.s_addr; struct sockaddr_in6 *in6 = (void *)ss; spin_lock(&addr_str_lock); @@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - sprintf(s, "%u.%u.%u.%u:%u", - (unsigned int)quad[0], - (unsigned int)quad[1], - (unsigned int)quad[2], - (unsigned int)quad[3], - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); break; case AF_INET6: - sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", - in6->sin6_addr.s6_addr16[0], - in6->sin6_addr.s6_addr16[1], - in6->sin6_addr.s6_addr16[2], - in6->sin6_addr.s6_addr16[3], - in6->sin6_addr.s6_addr16[4], - in6->sin6_addr.s6_addr16[5], - in6->sin6_addr.s6_addr16[6], - in6->sin6_addr.s6_addr16[7], - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); break; default: From 39139f64e14684cf2370770deb79d929d27cfd9b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Jul 2010 09:54:52 -0700 Subject: [PATCH 5/8] ceph: fix parsing of ipv6 addresses Check for brackets around the ipv6 address to avoid ambiguity with the port number. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index e8c5a2d0e88f..3ddef1556457 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -997,19 +997,32 @@ int ceph_parse_ips(const char *c, const char *end, struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } memset(ss, 0, sizeof(*ss)); if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - ',', &ipend)) { + delim, &ipend)) ss->ss_family = AF_INET; - } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - ',', &ipend)) { + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) ss->ss_family = AF_INET6; - } else { + else goto bad; - } p = ipend; + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + /* port? */ if (p < end && *p == ':') { port = 0; @@ -1043,7 +1056,7 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%s'\n", c); + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } From f91d3471ccf1ca9a795f46c94b1ded8dd219940c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 Jul 2010 15:18:31 -0700 Subject: [PATCH 6/8] ceph: fix creation of ipv6 sockets Use the address family from the peer address instead of assuming IPv4. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 3ddef1556457..15167b2daa55 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -203,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock, */ static struct socket *ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; int ret; BUG_ON(con->sock); - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); con->sock = sock; @@ -222,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", pr_addr(&con->peer_addr.in_addr), From 01a92f174f8a3b99dbb5e02c86e7ee1e576737af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 15 Jul 2010 13:24:32 -0700 Subject: [PATCH 7/8] ceph: reuse request message when replaying against recovering mds Replayed rename operations (after an mds failure/recovery) were broken because the request paths were regenerated from the dentry names, which get mangled when d_move() is called. Instead, resend the previous request message when replaying completed operations. Just make sure the REPLAY flag is set and the target ino is filled in. This fixes problems with workloads doing renames when the MDS restarts, where the rename operation appears to succeed, but on mds restart then fails (leading to client confusion, app breakage, etc.). Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab79f6c4ce8..23332bc44515 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1580,6 +1580,27 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + return 0; + } + if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; @@ -1601,13 +1622,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; dout(" r_locked_dir = %p\n", req->r_locked_dir); - - if (req->r_target_inode && req->r_got_unsafe) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - else - rhead->ino = 0; return 0; } From e979cf50395e24c4bdd489f60e2d5dd5ae66d255 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 15 Jul 2010 14:58:39 -0700 Subject: [PATCH 8/8] ceph: do not include cap/dentry releases in replayed messages Strip the cap and dentry releases from replayed messages. They can cause the shared state to get out of sync because they were generated (with the request message) earlier, and no longer reflect the current client state. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 8 ++++++++ fs/ceph/mds_client.h | 1 + 2 files changed, 9 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 23332bc44515..416c08d315db 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + /* cap releases */ releases = 0; if (req->r_inode_drop) @@ -1598,6 +1601,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; return 0; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index b292fa42a66d..952410c60d09 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -188,6 +188,7 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err;