libceph: support for sending notifies

Implement ceph_osdc_notify() for sending notifies.

Because the current messenger can't read incoming data into pagelists
(it can only write out from them), I had to go with a page vector for
the NOTIFY_COMPLETE payload, for now.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
Ilya Dryomov 2016-04-28 16:07:27 +02:00
Родитель 922dab6134
Коммит 1907920324
4 изменённых файлов: 249 добавлений и 11 удалений

Просмотреть файл

@ -114,6 +114,11 @@ struct ceph_osd_req_op {
struct {
struct ceph_osd_data request_data;
} notify_ack;
struct {
u64 cookie;
struct ceph_osd_data request_data;
struct ceph_osd_data response_data;
} notify;
struct {
u64 expected_object_size;
u64 expected_write_size;
@ -202,6 +207,7 @@ struct ceph_osd_linger_request {
struct ceph_osd_client *osdc;
u64 linger_id;
bool committed;
bool is_watch; /* watch or notify */
struct ceph_osd *osd;
struct ceph_osd_request *reg_req;
@ -220,14 +226,20 @@ struct ceph_osd_linger_request {
struct list_head scan_item;
struct completion reg_commit_wait;
struct completion notify_finish_wait;
int reg_commit_error;
int notify_finish_error;
int last_error;
u32 register_gen;
u64 notify_id;
rados_watchcb2_t wcb;
rados_watcherrcb_t errcb;
void *data;
struct page ***preply_pages;
size_t *preply_len;
};
struct ceph_osd_client {
@ -397,5 +409,13 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
u64 cookie,
void *payload,
size_t payload_len);
int ceph_osdc_notify(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
void *payload,
size_t payload_len,
u32 timeout,
struct page ***preply_pages,
size_t *preply_len);
#endif

Просмотреть файл

@ -476,6 +476,9 @@ struct ceph_osd_op {
__u8 op; /* CEPH_OSD_WATCH_OP_* */
__le32 gen; /* registration generation */
} __attribute__ ((packed)) watch;
struct {
__le64 cookie;
} __attribute__ ((packed)) notify;
struct {
__le64 offset, length;
__le64 src_offset;

Просмотреть файл

@ -206,8 +206,9 @@ static void dump_linger_request(struct seq_file *s,
seq_printf(s, "%llu\t", lreq->linger_id);
dump_target(s, &lreq->t);
seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen,
lreq->committed ? "C" : "", lreq->last_error);
seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
lreq->last_error);
}
static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)

Просмотреть файл

@ -334,6 +334,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_NOTIFY_ACK:
ceph_osd_data_release(&op->notify_ack.request_data);
break;
case CEPH_OSD_OP_NOTIFY:
ceph_osd_data_release(&op->notify.request_data);
ceph_osd_data_release(&op->notify.response_data);
break;
default:
break;
}
@ -845,6 +849,9 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
break;
case CEPH_OSD_OP_NOTIFY_ACK:
break;
case CEPH_OSD_OP_NOTIFY:
dst->notify.cookie = cpu_to_le64(src->notify.cookie);
break;
case CEPH_OSD_OP_SETALLOCHINT:
dst->alloc_hint.expected_object_size =
cpu_to_le64(src->alloc_hint.expected_object_size);
@ -1439,6 +1446,12 @@ static void setup_request_data(struct ceph_osd_request *req,
ceph_osdc_msg_data_add(req->r_reply,
&op->cls.response_data);
break;
case CEPH_OSD_OP_NOTIFY:
ceph_osdc_msg_data_add(msg,
&op->notify.request_data);
ceph_osdc_msg_data_add(req->r_reply,
&op->notify.response_data);
break;
}
data_len += op->indata_len;
@ -1771,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc)
RB_CLEAR_NODE(&lreq->osdc_node);
INIT_LIST_HEAD(&lreq->scan_item);
init_completion(&lreq->reg_commit_wait);
init_completion(&lreq->notify_finish_wait);
lreq->osdc = osdc;
target_init(&lreq->t);
@ -1934,6 +1948,7 @@ static void do_watch_notify(struct work_struct *w)
goto out;
}
WARN_ON(!lreq->is_watch);
dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
__func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
lwork->notify.payload_len);
@ -1997,6 +2012,24 @@ static void linger_commit_cb(struct ceph_osd_request *req)
linger_reg_commit_complete(lreq, req->r_result);
lreq->committed = true;
if (!lreq->is_watch) {
struct ceph_osd_data *osd_data =
osd_req_op_data(req, 0, notify, response_data);
void *p = page_address(osd_data->pages[0]);
WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
/* make note of the notify_id */
if (req->r_ops[0].outdata_len >= sizeof(u64)) {
lreq->notify_id = ceph_decode_64(&p);
dout("lreq %p notify_id %llu\n", lreq,
lreq->notify_id);
} else {
dout("lreq %p no notify_id\n", lreq);
}
}
mutex_unlock(&lreq->lock);
linger_put(lreq);
}
@ -2050,7 +2083,7 @@ static void send_linger(struct ceph_osd_linger_request *lreq)
req->r_mtime = lreq->mtime;
mutex_lock(&lreq->lock);
if (lreq->committed) {
if (lreq->is_watch && lreq->committed) {
WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
op->watch.cookie != lreq->linger_id);
op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
@ -2059,7 +2092,10 @@ static void send_linger(struct ceph_osd_linger_request *lreq)
op->watch.gen);
req->r_callback = linger_reconnect_cb;
} else {
WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
if (!lreq->is_watch)
lreq->notify_id = 0;
else
WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
dout("lreq %p register\n", lreq);
req->r_callback = linger_commit_cb;
}
@ -2147,7 +2183,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
*/
static void __linger_cancel(struct ceph_osd_linger_request *lreq)
{
if (lreq->ping_req->r_osd)
if (lreq->is_watch && lreq->ping_req->r_osd)
cancel_linger_request(lreq->ping_req);
if (lreq->reg_req->r_osd)
cancel_linger_request(lreq->reg_req);
@ -2174,6 +2210,15 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
return ret ?: lreq->reg_commit_error;
}
/*
 * Block (interruptibly) until lreq->notify_finish_wait is completed.
 *
 * A non-zero result from wait_for_completion_interruptible() is
 * returned as is; otherwise the value recorded in
 * lreq->notify_finish_error is returned.
 */
static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
{
	int err;

	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);

	err = wait_for_completion_interruptible(&lreq->notify_finish_wait);
	if (err)
		return err;

	return lreq->notify_finish_error;
}
/*
* Timeout callback, called every N seconds. When 1 or more OSD
* requests has been active for more than N seconds, we send a keepalive
@ -2220,7 +2265,7 @@ static void handle_timeout(struct work_struct *work)
found = true;
mutex_lock(&lreq->lock);
if (lreq->committed && !lreq->last_error)
if (lreq->is_watch && lreq->committed && !lreq->last_error)
send_linger_ping(lreq);
mutex_unlock(&lreq->lock);
}
@ -3032,6 +3077,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
u8 proto_ver, opcode;
u64 cookie, notify_id;
u64 notifier_id = 0;
s32 return_code = 0;
void *payload = NULL;
u32 payload_len = 0;
@ -3049,7 +3095,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
}
if (le16_to_cpu(msg->hdr.version) >= 2)
p += 4; /* skip return_code */
ceph_decode_32_safe(&p, end, return_code, bad);
if (le16_to_cpu(msg->hdr.version) >= 3)
ceph_decode_64_safe(&p, end, notifier_id, bad);
@ -3063,13 +3109,38 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
}
mutex_lock(&lreq->lock);
dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie,
lreq);
dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
opcode, cookie, lreq, lreq->is_watch);
if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
if (!lreq->last_error) {
lreq->last_error = -ENOTCONN;
queue_watch_error(lreq);
}
} else if (!lreq->is_watch) {
/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
if (lreq->notify_id && lreq->notify_id != notify_id) {
dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
lreq->notify_id, notify_id);
} else if (!completion_done(&lreq->notify_finish_wait)) {
struct ceph_msg_data *data =
list_first_entry_or_null(&msg->data,
struct ceph_msg_data,
links);
if (data) {
if (lreq->preply_pages) {
WARN_ON(data->type !=
CEPH_MSG_DATA_PAGES);
*lreq->preply_pages = data->pages;
*lreq->preply_len = data->length;
} else {
ceph_release_page_vector(data->pages,
calc_pages_for(0, data->length));
}
}
lreq->notify_finish_error = return_code;
complete_all(&lreq->notify_finish_wait);
}
} else {
/* CEPH_WATCH_EVENT_NOTIFY */
lwork = lwork_alloc(lreq, do_watch_notify);
@ -3241,6 +3312,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
if (!lreq)
return ERR_PTR(-ENOMEM);
lreq->is_watch = true;
lreq->wcb = wcb;
lreq->errcb = errcb;
lreq->data = data;
@ -3395,6 +3467,116 @@ out_put_req:
}
EXPORT_SYMBOL(ceph_osdc_notify_ack);
/*
 * Set up op @which of @req as a CEPH_OSD_OP_NOTIFY.
 *
 * The op's cookie is set to @cookie and its request data is a newly
 * allocated pagelist carrying, in order: @prot_ver, @timeout,
 * @payload_len and the @payload_len bytes at @payload.
 *
 * Returns 0 on success or -ENOMEM if the pagelist cannot be allocated
 * or filled.  On failure the pagelist is released and the op's
 * request data is left untouched.
 */
static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
				  u64 cookie, u32 prot_ver, u32 timeout,
				  void *payload, size_t payload_len)
{
	struct ceph_osd_req_op *op;
	struct ceph_pagelist *pl;

	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
	op->notify.cookie = cookie;

	pl = kmalloc(sizeof(*pl), GFP_NOIO);
	if (!pl)
		return -ENOMEM;

	ceph_pagelist_init(pl);

	/*
	 * Encode the caller-supplied protocol version instead of a
	 * hard-coded 1, and stop at the first failure rather than
	 * OR-ing error codes together.
	 *
	 * NOTE(review): payload_len is a size_t but goes through a
	 * 32-bit encoder - presumably callers never exceed U32_MAX;
	 * confirm.
	 */
	if (ceph_pagelist_encode_32(pl, prot_ver) ||
	    ceph_pagelist_encode_32(pl, timeout) ||
	    ceph_pagelist_encode_32(pl, payload_len) ||
	    ceph_pagelist_append(pl, payload, payload_len)) {
		ceph_pagelist_release(pl);
		return -ENOMEM;
	}

	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
	op->indata_len = pl->length;
	return 0;
}
/*
 * Send a notify on the object identified by @oid/@oloc and wait for it
 * to finish.
 *
 * @payload, @payload_len: data carried by the notify
 * @timeout: in seconds
 * @preply_pages, @preply_len: optional; where the reply page vector
 *                             and its length are stored
 *
 * When @preply_pages is non-NULL, @preply_{pages,len} are initialized
 * both on success and error.  The caller is responsible for:
 *
 *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
 *
 * Returns 0 on success or an error code: -ENOMEM on allocation
 * failure, otherwise whatever registering the linger request or
 * waiting for the notify to finish reported.
 */
int ceph_osdc_notify(struct ceph_osd_client *osdc,
		     struct ceph_object_id *oid,
		     struct ceph_object_locator *oloc,
		     void *payload,
		     size_t payload_len,
		     u32 timeout,
		     struct page ***preply_pages,
		     size_t *preply_len)
{
	struct ceph_osd_linger_request *lreq;
	struct ceph_osd_data *resp_data;
	struct page **id_pages;
	int err;

	WARN_ON(!timeout);

	/* make sure the out parameters are sane even if we fail early */
	if (preply_pages) {
		*preply_pages = NULL;
		*preply_len = 0;
	}

	lreq = linger_alloc(osdc);
	if (!lreq)
		return -ENOMEM;

	lreq->preply_pages = preply_pages;
	lreq->preply_len = preply_len;

	ceph_oid_copy(&lreq->t.base_oid, oid);
	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
	lreq->t.flags = CEPH_OSD_FLAG_READ;

	lreq->reg_req = alloc_linger_request(lreq);
	if (!lreq->reg_req) {
		err = -ENOMEM;
		goto out_put_lreq;
	}

	/* a single page to receive the notify_id in the op reply */
	id_pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(id_pages)) {
		err = PTR_ERR(id_pages);
		goto out_put_lreq;
	}

	down_write(&osdc->lock);

	/* must precede osd_req_op_*: the cookie is lreq->linger_id */
	linger_register(lreq);

	err = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
				     timeout, payload, payload_len);
	if (err) {
		/* undo the registration; the page was never attached */
		linger_unregister(lreq);
		up_write(&osdc->lock);
		ceph_release_page_vector(id_pages, 1);
		goto out_put_lreq;
	}

	resp_data = osd_req_op_data(lreq->reg_req, 0, notify, response_data);
	ceph_osd_data_pages_init(resp_data, id_pages, PAGE_SIZE, 0, false,
				 true);

	linger_submit(lreq);
	up_write(&osdc->lock);

	/* first the registration commit, then the notify completion */
	err = linger_reg_commit_wait(lreq);
	if (err)
		dout("lreq %p failed to initiate notify %d\n", lreq, err);
	else
		err = linger_notify_finish_wait(lreq);

	linger_cancel(lreq);

out_put_lreq:
	linger_put(lreq);
	return err;
}
EXPORT_SYMBOL(ceph_osdc_notify);
/*
* Call all pending notify callbacks - for use after a watch is
* unregistered, to make sure no more callbacks for it will be invoked
@ -3693,19 +3875,51 @@ out_unlock_osdc:
return m;
}
/*
 * Allocate a message for the incoming header @hdr.  If the header
 * announces a data payload (data_len != 0), also allocate a page
 * vector large enough for it and attach it to the message.
 *
 * Returns the new message, or NULL if either allocation fails.
 *
 * TODO: switch to a msg-owned pagelist
 */
static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
{
	struct ceph_msg *m;
	int type = le16_to_cpu(hdr->type);
	u32 front_len = le32_to_cpu(hdr->front_len);
	u32 data_len = le32_to_cpu(hdr->data_len);

	m = ceph_msg_new(type, front_len, GFP_NOIO, false);
	if (!m)
		return NULL;

	if (data_len) {
		struct page **pages;
		struct ceph_osd_data osd_data;

		pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
					       GFP_NOIO);
		/*
		 * ceph_alloc_page_vector() reports failure with an
		 * ERR_PTR (see the IS_ERR() check in ceph_osdc_notify()),
		 * so a NULL test would let an error pointer through.
		 */
		if (IS_ERR(pages)) {
			ceph_msg_put(m);
			return NULL;
		}

		/*
		 * NOTE(review): the trailing "false" presumably means the
		 * pages are not owned by the data item; the receiver
		 * (handle_watch_notify) either hands them to the caller or
		 * releases them itself - confirm.
		 */
		ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
					 false);
		ceph_osdc_msg_data_add(m, &osd_data);
	}

	return m;
}
static struct ceph_msg *alloc_msg(struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip)
{
struct ceph_osd *osd = con->private;
int type = le16_to_cpu(hdr->type);
int front = le32_to_cpu(hdr->front_len);
*skip = 0;
switch (type) {
case CEPH_MSG_OSD_MAP:
case CEPH_MSG_WATCH_NOTIFY:
return ceph_msg_new(type, front, GFP_NOFS, false);
return alloc_msg_with_page_vector(hdr);
case CEPH_MSG_OSD_OPREPLY:
return get_reply(con, hdr, skip);
default: