From 84d9509234c46481aded977193fcf23f892d715f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 May 2010 16:46:24 -0700 Subject: [PATCH 01/39] ceph: request FILE_LAZYIO cap when LAZY file mode is set Also clean up the file flags -> file mode -> wanted caps functions while we're at it. This resyncs this file with userspace. Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.c | 52 +++++++++++++++++++++++------------------------ fs/ceph/ceph_fs.h | 3 ++- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c index 79d76bc4303f..3ac6cc7c1156 100644 --- a/fs/ceph/ceph_fs.c +++ b/fs/ceph/ceph_fs.c @@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) int ceph_flags_to_mode(int flags) { + int mode; + #ifdef O_DIRECTORY /* fixme */ if ((flags & O_DIRECTORY) == O_DIRECTORY) return CEPH_FILE_MODE_PIN; -#endif -#ifdef O_LAZY - if (flags & O_LAZY) - return CEPH_FILE_MODE_LAZY; #endif if ((flags & O_APPEND) == O_APPEND) flags |= O_WRONLY; - flags &= O_ACCMODE; - if ((flags & O_RDWR) == O_RDWR) - return CEPH_FILE_MODE_RDWR; - if ((flags & O_WRONLY) == O_WRONLY) - return CEPH_FILE_MODE_WR; - return CEPH_FILE_MODE_RD; + if ((flags & O_ACCMODE) == O_RDWR) + mode = CEPH_FILE_MODE_RDWR; + else if ((flags & O_ACCMODE) == O_WRONLY) + mode = CEPH_FILE_MODE_WR; + else + mode = CEPH_FILE_MODE_RD; + +#ifdef O_LAZY + if (flags & O_LAZY) + mode |= CEPH_FILE_MODE_LAZY; +#endif + + return mode; } int ceph_caps_for_mode(int mode) { - switch (mode) { - case CEPH_FILE_MODE_PIN: - return CEPH_CAP_PIN; - case CEPH_FILE_MODE_RD: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | + int caps = CEPH_CAP_PIN; + + if (mode & CEPH_FILE_MODE_RD) + caps |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; - case CEPH_FILE_MODE_RDWR: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | + if (mode & CEPH_FILE_MODE_WR) + caps |= CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - case CEPH_FILE_MODE_WR: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | - CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | - CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - } - return 0; + if (mode & CEPH_FILE_MODE_LAZY) + caps |= CEPH_CAP_FILE_LAZYIO; + + return caps; } diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 2fa992eaf7da..c2e4c61a99a3 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -563,7 +563,8 @@ int ceph_flags_to_mode(int flags); CEPH_CAP_FILE_EXCL) #define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ - CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) + CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ + CEPH_CAP_PIN) #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ CEPH_LOCK_IXATTR) From 8c6e9229fc1989cf263a6fcd4ff406d7f473f966 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 16 Apr 2010 09:53:43 -0700 Subject: [PATCH 02/39] ceph: add LAZYIO ioctl to mark a file description for lazy consistency Allow an application to mark a file descriptor for lazy file consistency semantics, allowing buffered reads and writes when multiple clients are accessing the same file. Signed-off-by: Sage Weil --- fs/ceph/ioctl.c | 24 ++++++++++++++++++++++++ fs/ceph/ioctl.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index d085f07756b4..76e307d2aba1 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return 0; } +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&inode->i_lock); + ci->i_nr_by_mode[fi->fmode]--; + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[fi->fmode]++; + spin_unlock(&inode->i_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_GET_DATALOC: return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); } return -ENOTTY; } diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 25e4f1a9d059..88451a3b6857 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + #endif From 33caad324b88f75f42d836735d86feaafb3b40cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 26 May 2010 14:31:27 -0700 Subject: [PATCH 03/39] ceph: perform lazy writes when file mode and caps permit If we have marked a file as "lazy" (using the ceph ioctl), perform buffered writes when the MDS caps allow it. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 7 +++---- fs/ceph/file.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b81be9a56487..1a70a3ebf013 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -337,8 +337,7 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) } /* - * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else - * -1. + * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. */ static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) { @@ -346,7 +345,7 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) int mds = -1; struct rb_node *p; - /* prefer mds with WR|WRBUFFER|EXCL caps */ + /* prefer mds with WR|BUFFER|EXCL caps */ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); mds = cap->mds; @@ -831,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { int want = 0; int mode; - for (mode = 0; mode < 4; mode++) + for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) if (ci->i_nr_by_mode[mode]) want |= ceph_caps_for_mode(mode); return want; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7c08698fad3e..85c86ed5f4c0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -807,11 +807,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; loff_t endoff = pos + iov->iov_len; - int got = 0; + int want, got = 0; int ret, err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -824,8 +825,11 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, inode->i_size); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, - &got, endoff); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) goto out; @@ -833,7 +837,7 @@ retry_snap: inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); - if ((got & CEPH_CAP_FILE_BUFFER) == 0 || + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, From 2962507ca204f886967e1a089d9bec206d427c22 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 May 2010 10:40:43 -0700 Subject: [PATCH 04/39] ceph: perform lazy reads when file mode and caps permit If the file mode is marked as "lazy," perform cached/buffered reads when the caps permit it. Adjust the rdcache_gen and invalidation logic accordingly so that we manage our cache based on the FILE_CACHE -or- FILE_LAZYIO cap bits. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 16 ++++++++++------ fs/ceph/file.c | 12 ++++++++---- fs/ceph/inode.c | 5 +++-- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d9c60b84949a..e00797eed29c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -552,7 +552,7 @@ static void writepages_finish(struct ceph_osd_request *req, * page truncation thread, possibly losing some data that * raced its way in */ - if ((issued & CEPH_CAP_FILE_CACHE) == 0) + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) generic_error_remove_page(inode->i_mapping, page); unlock_page(page); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1a70a3ebf013..b28915d5f404 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -482,8 +482,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if ((issued & CEPH_CAP_FILE_CACHE) && - (had & CEPH_CAP_FILE_CACHE) == 0) + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) ci->i_rdcache_gen++; /* @@ -1509,11 +1509,13 @@ retry_locked: ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ ci->i_rdcache_gen && /* may have cached pages */ (file_wanted == 0 || /* no open files */ - (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { dout("check_caps trying to invalidate on %p\n", inode); if (try_nonblocking_invalidate(inode) < 0) { - if (revoking & CEPH_CAP_FILE_CACHE) { + if (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) { dout("check_caps queuing invalidate\n"); queue_invalidate = 1; ci->i_rdcache_revoking = ci->i_rdcache_gen; @@ -2276,7 +2278,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (((cap->issued & ~newcaps) & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode) == 0) { revoked_rdcache = 1; @@ -2374,7 +2377,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, writeback = 1; /* will delay ack */ else if (dirty & ~newcaps) check_caps = 1; /* initiate writeback in check_caps */ - else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || + else if (((used & ~newcaps) & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) == 0 || revoked_rdcache) check_caps = 2; /* send revoke ack in check_caps */ cap->issued = newcaps; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 85c86ed5f4c0..2329244f427b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; + struct ceph_file_info *fi = filp->private_data; loff_t *ppos = &iocb->ki_pos; size_t len = iov->iov_len; struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); void *base = iov->iov_base; ssize_t ret; - int got = 0; + int want, got = 0; int checkeof = 0, read = 0; dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: __ceph_do_pending_vmtruncate(inode); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, - &got, -1); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) goto out; dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got)); - if ((got & CEPH_CAP_FILE_CACHE) == 0 || + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || (inode->i_sb->s_flags & MS_SYNCHRONOUS)) /* hmm, this isn't really async... */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 389f9dbd9949..5d893d31e399 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued, * the file is either opened or mmaped */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - CEPH_CAP_FILE_EXCL)) || + CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| + CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_LAZYIO)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; From ee6b272b9c3447a78fa831e37b925aefd5587ec9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Jun 2010 12:55:52 -0700 Subject: [PATCH 05/39] ceph: drop unused argument Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- fs/ceph/mds_client.c | 10 ++++------ fs/ceph/mds_client.h | 3 +-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b28915d5f404..a5b5725931bf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2717,7 +2717,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, * along for the mds (who clearly thinks we still have this * cap). */ - ceph_add_cap_releases(mdsc, session, -1); + ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); goto done; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index dd440bd438a9..26a5368e91f2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1067,15 +1067,13 @@ static int trim_caps(struct ceph_mds_client *mdsc, * Called under s_mutex. */ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra) + struct ceph_mds_session *session) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; int err = -ENOMEM; + int extra = mdsc->client->mount_args->cap_release_safety; - if (extra < 0) - extra = mdsc->client->mount_args->cap_release_safety; spin_lock(&session->s_cap_lock); @@ -2005,7 +2003,7 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session, -1); + ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2715,7 +2713,7 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s, -1); + ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 952410c60d09..e389902db131 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -324,8 +324,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) } extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra); + struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); From 38e8883ee31667d901feb9106f4863af35948c91 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Jun 2010 12:57:11 -0700 Subject: [PATCH 06/39] ceph: simplify add_cap_releases No functional change, aside from more useful debug output. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 26a5368e91f2..774407f96ff1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1069,11 +1069,14 @@ static int trim_caps(struct ceph_mds_client *mdsc, int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg, *partial = NULL; struct ceph_mds_cap_release *head; int err = -ENOMEM; int extra = mdsc->client->mount_args->cap_release_safety; + int num; + dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, + extra); spin_lock(&session->s_cap_lock); @@ -1082,9 +1085,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg, list_head); head = msg->front.iov_base; - extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + num = le32_to_cpu(head->num); + if (num) { + dout(" partial %p with (%d/%d)\n", msg, num, + (int)CEPH_CAPS_PER_RELEASE); + extra += CEPH_CAPS_PER_RELEASE - num; + partial = msg; + } } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, @@ -1101,19 +1109,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; } - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - if (head->num) { - dout(" queueing non-full %p (%d)\n", msg, - le32_to_cpu(head->num)); - list_move_tail(&msg->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= - CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); - } + if (partial) { + head = partial->front.iov_base; + num = le32_to_cpu(head->num); + dout(" queueing partial %p with %d/%d\n", partial, num, + (int)CEPH_CAPS_PER_RELEASE); + list_move_tail(&partial->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; } err = 0; spin_unlock(&session->s_cap_lock); From 3b454c4945c756686e91d77eeefac80cb5d21baf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Jun 2010 13:20:33 -0700 Subject: [PATCH 07/39] ceph: simplify caps revocation, fix for multimds The caps revocation should either initiate writeback, invalidateion, or call check_caps to ack or do the dirty work. The primary question is whether we can get away with only checking the auth cap or whether all caps need to be checked. The old code was doing...something else. At the very least, revocations from non-auth MDSs could break by triggering the "check auth cap only" case. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a5b5725931bf..82ffde618af0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2278,8 +2278,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO)) && + if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode) == 0) { revoked_rdcache = 1; @@ -2371,16 +2371,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { - dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), - ceph_cap_string(newcaps)); - if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) - writeback = 1; /* will delay ack */ - else if (dirty & ~newcaps) - check_caps = 1; /* initiate writeback in check_caps */ - else if (((used & ~newcaps) & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO)) == 0 || - revoked_rdcache) - check_caps = 2; /* send revoke ack in check_caps */ + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (revoking & CEPH_CAP_FILE_BUFFER) + writeback = 1; /* initiate writeback; will delay ack */ + else if (revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + queue_invalidate) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { From ca81f3f6bd759f90a4b940cddda1f8bc61a7725a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Jun 2010 14:21:36 -0700 Subject: [PATCH 08/39] ceph: skip if no auth cap in flush_snaps If we have a capsnap but no auth cap (e.g. because it is migrating to another mds), bail out and do nothing for now. Do NOT remove the capsnap from the flush list. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 82ffde618af0..dbf0d6a02a77 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -339,7 +339,7 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) /* * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) +static int __ceph_get_cap_mds(struct ceph_inode_info *ci) { struct ceph_cap *cap; int mds = -1; @@ -349,8 +349,6 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); mds = cap->mds; - if (mseq) - *mseq = cap->mseq; if (cap->issued & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_EXCL)) @@ -363,7 +361,7 @@ int ceph_get_cap_mds(struct inode *inode) { int mds; spin_lock(&inode->i_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); + mds = __ceph_get_cap_mds(ceph_inode(inode)); spin_unlock(&inode->i_lock); return mds; } @@ -1231,7 +1229,13 @@ retry: BUG_ON(capsnap->dirty == 0); /* pick mds, take s_mutex */ - mds = __ceph_get_cap_mds(ci, &mseq); + if (ci->i_auth_cap == NULL) { + dout("no auth cap (migrating?), doing nothing\n"); + goto out; + } + mds = ci->i_auth_cap->session->s_mds; + mseq = ci->i_auth_cap->mseq; + if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -1250,8 +1254,8 @@ retry: } /* * if session == NULL, we raced against a cap - * deletion. retry, and we'll get a better - * @mds value next time. + * deletion or migration. retry, and we'll + * get a better @mds value next time. */ spin_lock(&inode->i_lock); goto retry; @@ -1289,6 +1293,7 @@ retry: list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); +out: if (psession) *psession = session; else if (session) { From cd84db6e4051a9fb7941d49d31a0193a3371fd61 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 11 Jun 2010 16:58:48 -0700 Subject: [PATCH 09/39] ceph: code cleanup Mainly fixing minor issues reported by sparse. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/armor.c | 6 +++++- fs/ceph/auth.c | 6 +++--- fs/ceph/auth_x.c | 6 +++--- fs/ceph/buffer.c | 16 ---------------- fs/ceph/caps.c | 6 +++--- fs/ceph/crypto.c | 27 +++++++++++++++------------ fs/ceph/crypto.h | 4 ++-- fs/ceph/decode.h | 6 ++++-- fs/ceph/dir.c | 2 ++ fs/ceph/file.c | 4 ++-- fs/ceph/messenger.c | 8 ++++---- fs/ceph/osdmap.c | 2 +- fs/ceph/xattr.c | 2 ++ 13 files changed, 46 insertions(+), 49 deletions(-) diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c index 67b2c030924b..eb2a666b0be7 100644 --- a/fs/ceph/armor.c +++ b/fs/ceph/armor.c @@ -1,11 +1,15 @@ #include +int ceph_armor(char *dst, const char *src, const char *end); +int ceph_unarmor(char *dst, const char *src, const char *end); + /* * base64 encode/decode. */ -const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char *pem_key = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; static int encode_bits(int c) { diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 89490beaf537..6d2e30600627 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -20,7 +20,7 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) { switch (protocol) { case CEPH_AUTH_NONE: @@ -133,8 +133,8 @@ bad: return -ERANGE; } -int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int ceph_build_auth_request(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) { struct ceph_mon_request_header *monhdr = msg_buf; void *p = monhdr + 1; diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 6d44053ecff1..582e0b2caf8a 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -87,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret, /* * get existing (or insert new) ticket handler */ -struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, - int service) +static struct ceph_x_ticket_handler * +get_ticket_handler(struct ceph_auth_client *ac, int service) { struct ceph_x_ticket_handler *th; struct ceph_x_info *xi = ac->private; @@ -429,7 +429,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, auth->struct_v = 1; auth->key = 0; for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) - auth->key ^= *u; + auth->key ^= *(__le64 *)u; dout(" server_challenge %llx client_challenge %llx key %llx\n", xi->server_challenge, le64_to_cpu(auth->client_challenge), le64_to_cpu(auth->key)); diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c index c67535d70aa6..cd39f17021de 100644 --- a/fs/ceph/buffer.c +++ b/fs/ceph/buffer.c @@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref) kfree(b); } -int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp) -{ - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); - b->is_vmalloc = true; - } - if (!b->vec.iov_base) - return -ENOMEM; - b->alloc_len = len; - b->vec.iov_len = len; - return 0; -} - int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) { size_t len; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dbf0d6a02a77..d992880d21d4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1194,6 +1194,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) + __releases(ci->vfs_inode->i_lock) + __acquires(ci->vfs_inode->i_lock) { struct inode *inode = &ci->vfs_inode; int mds; @@ -1439,7 +1441,6 @@ static int try_nonblocking_invalidate(struct inode *inode) */ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) - __releases(session->s_mutex) { struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); struct ceph_mds_client *mdsc = &client->mdsc; @@ -2256,8 +2257,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_buffer *xattr_buf) - __releases(inode->i_lock) - __releases(session->s_mutex) + __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index f704b3b62424..c9318278f419 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); } -const u8 *aes_iv = "cephsageyudagreg"; +static const u8 *aes_iv = "cephsageyudagreg"; -int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src, size_t src_len) +static int ceph_aes_encrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) { struct scatterlist sg_in[2], sg_out[1]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); @@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, return 0; } -int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src1, size_t src1_len, - const void *src2, size_t src2_len) +static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, + size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) { struct scatterlist sg_in[3], sg_out[1]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); @@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, return 0; } -int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src, size_t src_len) +static int ceph_aes_decrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) { struct scatterlist sg_in[1], sg_out[2]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); @@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, return 0; } -int ceph_aes_decrypt2(const void *key, int key_len, - void *dst1, size_t *dst1_len, - void *dst2, size_t *dst2_len, - const void *src, size_t src_len) +static int ceph_aes_decrypt2(const void *key, int key_len, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) { struct scatterlist sg_in[1], sg_out[3]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 40b502e6bd89..bdf38607323c 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret, const void *src2, size_t src2_len); /* armor.c */ -extern int ceph_armor(char *dst, const void *src, const void *end); -extern int ceph_unarmor(void *dst, const char *src, const char *end); +extern int ceph_armor(char *dst, const char *src, const char *end); +extern int ceph_unarmor(char *dst, const char *src, const char *end); #endif diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h index 65b3e022eaf5..3d25415afe63 100644 --- a/fs/ceph/decode.h +++ b/fs/ceph/decode.h @@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv, */ static inline void ceph_encode_addr(struct ceph_entity_addr *a) { - a->in_addr.ss_family = htons(a->in_addr.ss_family); + __be16 ss_family = htons(a->in_addr.ss_family); + a->in_addr.ss_family = *(__u16 *)&ss_family; } static inline void ceph_decode_addr(struct ceph_entity_addr *a) { - a->in_addr.ss_family = ntohs(a->in_addr.ss_family); + __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; + a->in_addr.ss_family = ntohs(ss_family); WARN_ON(a->in_addr.ss_family == 512); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f94ed3c7f6a5..0ec6e67533ef 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -94,6 +94,8 @@ static unsigned fpos_off(loff_t p) */ static int __dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) + __releases(inode->i_lock) + __acquires(inode->i_lock) { struct inode *inode = filp->f_dentry->d_inode; struct ceph_file_info *fi = filp->private_data; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2329244f427b..5fe50ffa2deb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -317,7 +317,7 @@ void ceph_release_page_vector(struct page **pages, int num_pages) /* * allocate a vector new pages */ -struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) { struct page **pages; int i; @@ -745,7 +745,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, size_t len = iov->iov_len; struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - void *base = iov->iov_base; + void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, read = 0; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 15167b2daa55..71263a2a78d7 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -108,7 +108,7 @@ void ceph_msgr_exit(void) destroy_workqueue(ceph_msgr_wq); } -void ceph_msgr_flush() +void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } @@ -1081,11 +1081,11 @@ static int process_banner(struct ceph_connection *con) sizeof(con->peer_addr)) != 0 && !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", + pr_warning("wrong peer, want %s/%d, got %s/%d\n", pr_addr(&con->peer_addr.in_addr), - le64_to_cpu(con->peer_addr.nonce), + (int)le32_to_cpu(con->peer_addr.nonce), pr_addr(&con->actual_peer_addr.in_addr), - le64_to_cpu(con->actual_peer_addr.nonce)); + (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 416d46adbf87..46b391d8e86c 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -424,7 +424,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -void __decode_pool(void **p, struct ceph_pg_pool_info *pi) +static void __decode_pool(void **p, struct ceph_pg_pool_info *pi) { ceph_decode_copy(p, &pi->v, sizeof(pi->v)); calc_pg_masks(pi); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 68aeebc69681..097a2654c00f 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -337,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) } static int __build_xattrs(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) { u32 namelen; u32 numattr = 0; From 0deb01c9998f8112c5e478e3fe3a930131abbc0a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 17 Jun 2010 14:19:01 -0700 Subject: [PATCH 10/39] ceph: track laggy state of mds from mdsmap Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 4 +++- fs/ceph/mdsmap.c | 6 +++++- fs/ceph/mdsmap.h | 8 ++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 774407f96ff1..6e40db2a0014 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2377,9 +2377,11 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); - dout("check_new_map mds%d state %s -> %s (session %s)\n", + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", session_state_name(s->s_state)); if (memcmp(ceph_mdsmap_get_addr(oldmap, i), diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index c4c498e6dfef..040be6d1150b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; + struct ceph_timespec laggy_since; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) state_seq = ceph_decode_64(p); ceph_decode_copy(p, &addr, sizeof(addr)); ceph_decode_addr(&addr); - *p += sizeof(struct ceph_timespec); + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info[mds].global_id = global_id; m->m_info[mds].state = state; m->m_info[mds].addr = addr; + m->m_info[mds].laggy = + (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); m->m_info[mds].num_export_targets = num_export_targets; if (num_export_targets) { m->m_info[mds].export_targets = diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h index eacc131aa5cb..4c5cb0880bba 100644 --- a/fs/ceph/mdsmap.h +++ b/fs/ceph/mdsmap.h @@ -13,6 +13,7 @@ struct ceph_mds_info { struct ceph_entity_addr addr; s32 state; int num_export_targets; + bool laggy; u32 *export_targets; }; @@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) return m->m_info[w].state; } +static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) +{ + if (w >= 0 && w < m->m_max_mds) + return m->m_info[w].laggy; + return false; +} + extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); From 37151668bad3fd058368752bee476f2ba3645596 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 17 Jun 2010 16:16:12 -0700 Subject: [PATCH 11/39] ceph: do caps accounting per mds_client Caps related accounting is now being done per mds client instead of just being global. This prepares ground work for a later revision of the caps preallocated reservation list. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/caps.c | 187 ++++++++++++++++++++----------------------- fs/ceph/mds_client.c | 16 ++-- fs/ceph/mds_client.h | 22 +++++ fs/ceph/super.c | 6 -- fs/ceph/super.h | 15 ++-- 5 files changed, 131 insertions(+), 115 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d992880d21d4..47068b10baf8 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps) return cap_str[i]; } -/* - * Cap reservations - * - * Maintain a global pool of preallocated struct ceph_caps, referenced - * by struct ceph_caps_reservations. This ensures that we preallocate - * memory needed to successfully process an MDS response. (If an MDS - * sends us cap information and we fail to process it, we will have - * problems due to the client and MDS being out of sync.) - * - * Reservations are 'owned' by a ceph_cap_reservation context. - */ -static spinlock_t caps_list_lock; -static struct list_head caps_list; /* unused (reserved or unreserved) */ -static int caps_total_count; /* total caps allocated */ -static int caps_use_count; /* in use */ -static int caps_reserve_count; /* unused, reserved */ -static int caps_avail_count; /* unused, unreserved */ -static int caps_min_count; /* keep at least this many (unreserved) */ - -void __init ceph_caps_init(void) +void ceph_caps_init(struct ceph_mds_client *mdsc) { - INIT_LIST_HEAD(&caps_list); - spin_lock_init(&caps_list_lock); + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); } -void ceph_caps_finalize(void) +void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; - spin_lock(&caps_list_lock); - while (!list_empty(&caps_list)) { - cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } - caps_total_count = 0; - caps_avail_count = 0; - caps_use_count = 0; - caps_reserve_count = 0; - caps_min_count = 0; - spin_unlock(&caps_list_lock); + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(int delta) +void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) { - spin_lock(&caps_list_lock); - caps_min_count += delta; - BUG_ON(caps_min_count < 0); - spin_unlock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count += delta; + BUG_ON(mdsc->caps_min_count < 0); + spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) +int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) { int i; struct ceph_cap *cap; @@ -176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) dout("reserve caps ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ - spin_lock(&caps_list_lock); - if (caps_avail_count >= need) + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) have = need; else - have = caps_avail_count; - caps_avail_count -= have; - caps_reserve_count += have; - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); @@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) } BUG_ON(have + alloc != need); - spin_lock(&caps_list_lock); - caps_total_count += alloc; - caps_reserve_count += alloc; - list_splice(&newcaps, &caps_list); + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); ctx->count = need; dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", - ctx, caps_total_count, caps_use_count, caps_reserve_count, - caps_avail_count); + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); return 0; out_alloc_count: @@ -220,26 +205,29 @@ out_alloc_count: return ret; } -int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) +int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); if (ctx->count) { - spin_lock(&caps_list_lock); - BUG_ON(caps_reserve_count < ctx->count); - caps_reserve_count -= ctx->count; - caps_avail_count += ctx->count; + spin_lock(&mdsc->caps_list_lock); + BUG_ON(mdsc->caps_reserve_count < ctx->count); + mdsc->caps_reserve_count -= ctx->count; + mdsc->caps_avail_count += ctx->count; ctx->count = 0; dout("unreserve caps %d = %d used + %d resv + %d avail\n", - caps_total_count, caps_use_count, caps_reserve_count, - caps_avail_count); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); } return 0; } -static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) +static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -247,71 +235,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { - caps_use_count++; - caps_total_count++; + mdsc->caps_use_count++; + mdsc->caps_total_count++; } return cap; } - spin_lock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", - ctx, ctx->count, caps_total_count, caps_use_count, - caps_reserve_count, caps_avail_count); + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); - BUG_ON(ctx->count > caps_reserve_count); - BUG_ON(list_empty(&caps_list)); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; - caps_reserve_count--; - caps_use_count++; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; - cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); return cap; } -void ceph_put_cap(struct ceph_cap *cap) +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { - spin_lock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); dout("put_cap %p %d = %d used + %d resv + %d avail\n", - cap, caps_total_count, caps_use_count, - caps_reserve_count, caps_avail_count); - caps_use_count--; + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. */ - if (caps_avail_count >= caps_reserve_count + caps_min_count) { - caps_total_count--; + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { - caps_avail_count++; - list_add(&cap->caps_item, &caps_list); + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); } - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); } void ceph_reservation_status(struct ceph_client *client, int *total, int *avail, int *used, int *reserved, int *min) { + struct ceph_mds_client *mdsc = &client->mdsc; + if (total) - *total = caps_total_count; + *total = mdsc->caps_total_count; if (avail) - *avail = caps_avail_count; + *avail = mdsc->caps_avail_count; if (used) - *used = caps_use_count; + *used = mdsc->caps_use_count; if (reserved) - *reserved = caps_reserve_count; + *reserved = mdsc->caps_reserve_count; if (min) - *min = caps_min_count; + *min = mdsc->caps_min_count; } /* @@ -540,7 +531,7 @@ retry: new_cap = NULL; } else { spin_unlock(&inode->i_lock); - new_cap = get_cap(caps_reservation); + new_cap = get_cap(mdsc, caps_reservation); if (new_cap == NULL) return -ENOMEM; goto retry; @@ -898,7 +889,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) ci->i_auth_cap = NULL; if (removed) - ceph_put_cap(cap); + ceph_put_cap(mdsc, cap); if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { struct ceph_snap_realm *realm = ci->i_snap_realm; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6e40db2a0014..641a8a37e7b3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -449,7 +449,7 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req->r_path1); kfree(req->r_path2); put_request_session(req); - ceph_unreserve_caps(&req->r_caps_reservation); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); kfree(req); } @@ -512,7 +512,8 @@ static void __register_request(struct ceph_mds_client *mdsc, { req->r_tid = ++mdsc->last_tid; if (req->r_num_caps) - ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); + ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); __insert_request(mdsc, req); @@ -764,7 +765,7 @@ static int iterate_session_caps(struct ceph_mds_session *session, last_inode = NULL; } if (old_cap) { - ceph_put_cap(old_cap); + ceph_put_cap(session->s_mdsc, old_cap); old_cap = NULL; } @@ -793,7 +794,7 @@ out: if (last_inode) iput(last_inode); if (old_cap) - ceph_put_cap(old_cap); + ceph_put_cap(session->s_mdsc, old_cap); return ret; } @@ -1251,6 +1252,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) return ERR_PTR(-ENOMEM); mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1986,7 +1988,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(&req->r_caps_reservation); + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } mutex_unlock(&req->r_fill_mutex); @@ -2767,6 +2769,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + ceph_caps_init(mdsc); + ceph_adjust_min_caps(mdsc, client->min_caps); + return 0; } @@ -2962,6 +2967,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc) if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e389902db131..8f2126321f2d 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request { u64 r_tid; /* transaction id */ struct rb_node r_node; + struct ceph_mds_client *r_mdsc; int r_op; /* mds op code */ int r_mds; @@ -267,6 +268,27 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ + #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; #endif diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fa87f51e38e1..1a0bb4863a5d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -630,7 +630,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) /* caps */ client->min_caps = args->max_readdir; - ceph_adjust_min_caps(client->min_caps); /* subsystems */ err = ceph_monc_init(&client->monc, client); @@ -680,8 +679,6 @@ static void ceph_destroy_client(struct ceph_client *client) ceph_monc_stop(&client->monc); - ceph_adjust_min_caps(-client->min_caps); - ceph_debugfs_client_cleanup(client); destroy_workqueue(client->wb_wq); destroy_workqueue(client->pg_inv_wq); @@ -1043,8 +1040,6 @@ static int __init init_ceph(void) if (ret) goto out_msgr; - ceph_caps_init(); - ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; @@ -1069,7 +1064,6 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); - ceph_caps_finalize(); destroy_caches(); ceph_msgr_exit(); ceph_debugfs_cleanup(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 10a4a406e887..44d10cb0aeca 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -560,11 +560,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); -extern void ceph_caps_init(void); -extern void ceph_caps_finalize(void); -extern void ceph_adjust_min_caps(int delta); -extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); -extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_client *client, int *total, int *avail, int *used, int *reserved, int *min); @@ -806,7 +808,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap) __ceph_remove_cap(cap); spin_unlock(&inode->i_lock); } -extern void ceph_put_cap(struct ceph_cap *cap); +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); From 796d6955a51ce6768d0e033f27a2f8f5be6cb39a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Jun 2010 15:50:10 -0700 Subject: [PATCH 12/39] ceph: only set num_pages in calc_layout Setting it elsewhere is unnecessary and more fragile. Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index e38522347898..707d2dbd8776 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -1276,8 +1276,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ req->r_pages = pages; - num_pages = calc_pages_for(off, *plen); - req->r_num_pages = num_pages; dout("readpages final extent is %llu~%llu (%d pages)\n", off, *plen, req->r_num_pages); @@ -1319,7 +1317,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, /* it may be a short write due to an object boundary */ req->r_pages = pages; - req->r_num_pages = calc_pages_for(off, len); dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_num_pages); From ed0552a1a21d2f2692b84c366ce04ad17377780c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Jun 2010 13:38:25 -0700 Subject: [PATCH 13/39] ceph: introduce helper to connect to mds export targets There are a few cases where we need to open sessions with a given mds's potential export targets. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 641a8a37e7b3..462602ec7fb1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -704,6 +704,43 @@ static int __open_session(struct ceph_mds_client *mdsc, return 0; } +/* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static void __open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + int target; + + if (mds >= mdsc->mdsmap->m_max_mds) + return; + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + target = mi->export_targets[i]; + ts = __ceph_lookup_mds_session(mdsc, target); + if (!ts) { + ts = register_session(mdsc, target); + if (IS_ERR(ts)) + return; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + else + dout(" mds%d target mds%d %p is %s\n", session->s_mds, + i, ts, session_state_name(ts->s_state)); + ceph_put_mds_session(ts); + } +} + /* * session caps */ From cb170a22153730eb9c82b6c85ead2001dba6c41a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Jun 2010 13:38:35 -0700 Subject: [PATCH 14/39] ceph: connect to export targets if mds is laggy If an MDS we are talking to may have failed, we need to open sessions to its potential export targets to ensure that any in-progress migration that may have involved some of our caps is properly handled. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 462602ec7fb1..552b934c9cd0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2470,6 +2470,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, wake_up_session_caps(s, 1); } } + + for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } } From 154f42c2c3c3b66a7a63dad5648e8a9860a32af9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Jun 2010 13:45:04 -0700 Subject: [PATCH 15/39] ceph: connect to export targets on cap export When we get a cap EXPORT message, make sure we are connected to all export targets to ensure we can handle the matching IMPORT. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 14 ++++++++++++-- fs/ceph/mds_client.c | 8 ++++++++ fs/ceph/mds_client.h | 3 +++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 47068b10baf8..52befa65fbf7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2573,7 +2573,8 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + int *open_target_sessions) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2605,6 +2606,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, ci->i_cap_exporting_mds = mds; ci->i_cap_exporting_mseq = mseq; ci->i_cap_exporting_issued = cap->issued; + + /* + * make sure we have open sessions with all possible + * export targets, so that we get the matching IMPORT + */ + *open_target_sessions = 1; } __ceph_remove_cap(cap); } @@ -2680,6 +2687,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 size, max_size; u64 tid; void *snaptrace; + int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); @@ -2731,7 +2739,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session); + handle_cap_export(inode, h, session, &open_target_sessions); goto done; case CEPH_CAP_OP_IMPORT: @@ -2778,6 +2786,8 @@ done: done_unlocked: if (inode) iput(inode); + if (open_target_sessions) + ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 552b934c9cd0..a546e0ddb8e3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -741,6 +741,14 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, } } +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + /* * session caps */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 8f2126321f2d..c86be30e8707 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -364,4 +364,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + #endif From 2bc50259fa0aa1868f8b2ba1d374406cb3c57f72 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Wed, 30 Jun 2010 12:44:34 -0700 Subject: [PATCH 16/39] ceph: add ceph_get_cap_for_mds function. Signed-off-by: Greg Farnum Signed-off-by: Sage Weil --- fs/ceph/caps.c | 10 ++++++++++ fs/ceph/super.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 52befa65fbf7..e3b848d7698d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -327,6 +327,16 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) return NULL; } +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->vfs_inode.i_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->vfs_inode.i_lock); + return cap; +} + /* * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 44d10cb0aeca..8ceb62380d3f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -816,6 +816,8 @@ extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); From e55b71f802fd448a79275ba7b263fe1a8639be5f Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 22 Jun 2010 15:58:01 -0700 Subject: [PATCH 17/39] ceph: handle ESTALE properly; on receipt send to authority if it wasn't Signed-off-by: Greg Farnum Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 41 ++++++++++++++++++++++++++++++++++------- fs/ceph/mds_client.h | 2 +- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a546e0ddb8e3..34d215ff4c82 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1628,6 +1628,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, req->r_mds = mds; req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); @@ -1962,21 +1971,39 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); /* - * Tolerate 2 consecutive ESTALEs from the same mds. - * FIXME: we should be looking at the cap migrate_seq. + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE */ if (result == -ESTALE) { - req->r_direct_mode = USE_AUTH_MDS; - req->r_num_stale++; - if (req->r_num_stale <= 2) { + dout("got ESTALE on request %llu", req->r_tid); + if (!req->r_inode) ; //do nothing; not an authority problem + else if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now"); + req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; + } else { + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + struct ceph_cap *cap = + ceph_get_cap_for_mds(ci, req->r_mds);; + + dout("already using auth"); + if ((!cap || cap != ci->i_auth_cap) || + (cap->mseq != req->r_sent_on_mseq)) { + dout("but cap changed, so resending"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } } - } else { - req->r_num_stale = 0; + dout("have to return ESTALE on request %llu", req->r_tid); } + if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c86be30e8707..ab7e89f5e344 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -208,8 +208,8 @@ struct ceph_mds_request { int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ - int r_num_stale; int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ struct kref r_kref; struct list_head r_wait; From e0f9f9ee8f6cb60fe49e32e1df790a698ce0840c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 30 Jun 2010 12:45:29 -0700 Subject: [PATCH 18/39] ceph: remove unused 'monport' mount option Signed-off-by: Sage Weil --- fs/ceph/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1a0bb4863a5d..c1ea38e5aebd 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -324,7 +324,6 @@ const char *ceph_msg_type_name(int type) enum { Opt_fsidmajor, Opt_fsidminor, - Opt_monport, Opt_wsize, Opt_rsize, Opt_osdtimeout, @@ -357,7 +356,6 @@ enum { static match_table_t arg_tokens = { {Opt_fsidmajor, "fsidmajor=%ld"}, {Opt_fsidminor, "fsidminor=%ld"}, - {Opt_monport, "monport=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_osdtimeout, "osdtimeout=%d"}, From c309f0ab26ca37663f368918553d02e90356c89d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 30 Jun 2010 21:34:01 -0700 Subject: [PATCH 19/39] ceph: clean up fsid mount option Specify the fsid mount option in hex, not via the major/minor u64 hackery we had before. Signed-off-by: Sage Weil --- fs/ceph/super.c | 52 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c1ea38e5aebd..3100c909cbb1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -2,6 +2,7 @@ #include "ceph_debug.h" #include +#include #include #include #include @@ -150,9 +151,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) struct ceph_mount_args *args = client->mount_args; if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", - le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]), - le64_to_cpu(*(__le64 *)&args->fsid.fsid[8])); + seq_printf(m, ",fsid=" FSID_FORMAT, PR_FSID(&args->fsid)); if (args->flags & CEPH_OPT_NOSHARE) seq_puts(m, ",noshare"); if (args->flags & CEPH_OPT_DIRSTAT) @@ -322,8 +321,6 @@ const char *ceph_msg_type_name(int type) * mount options */ enum { - Opt_fsidmajor, - Opt_fsidminor, Opt_wsize, Opt_rsize, Opt_osdtimeout, @@ -338,6 +335,7 @@ enum { Opt_congestion_kb, Opt_last_int, /* int args above */ + Opt_fsid, Opt_snapdirname, Opt_name, Opt_secret, @@ -354,8 +352,6 @@ enum { }; static match_table_t arg_tokens = { - {Opt_fsidmajor, "fsidmajor=%ld"}, - {Opt_fsidminor, "fsidminor=%ld"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_osdtimeout, "osdtimeout=%d"}, @@ -369,6 +365,7 @@ static match_table_t arg_tokens = { {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ + {Opt_fsid, "fsid=%s"}, {Opt_snapdirname, "snapdirname=%s"}, {Opt_name, "name=%s"}, {Opt_secret, "secret=%s"}, @@ -384,6 +381,36 @@ static match_table_t arg_tokens = { {-1, NULL} }; +static int parse_fsid(const char *str, struct ceph_fsid *fsid) +{ + int i = 0; + char tmp[3]; + int err = -EINVAL; + int d; + + dout("parse_fsid '%s'\n", str); + tmp[2] = 0; + while (*str && i < 16) { + if (ispunct(*str)) { + str++; + continue; + } + if (!isxdigit(str[0]) || !isxdigit(str[1])) + break; + tmp[0] = str[0]; + tmp[1] = str[1]; + if (sscanf(tmp, "%x", &d) < 1) + break; + fsid->fsid[i] = d & 0xff; + i++; + str += 2; + } + + if (i == 16) + err = 0; + dout("parse_fsid ret %d got fsid " FSID_FORMAT, err, PR_FSID(fsid)); + return err; +} static struct ceph_mount_args *parse_mount_args(int flags, char *options, const char *dev_name, @@ -467,12 +494,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, dout("got token %d\n", token); } switch (token) { - case Opt_fsidmajor: - *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval); - break; - case Opt_fsidminor: - *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval); - break; case Opt_ip: err = ceph_parse_ips(argstr[0].from, argstr[0].to, @@ -483,6 +504,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->flags |= CEPH_OPT_MYIP; break; + case Opt_fsid: + err = parse_fsid(argstr[0].from, &args->fsid); + if (err == 0) + args->flags |= CEPH_OPT_FSID; + break; case Opt_snapdirname: kfree(args->snapdir_name); args->snapdir_name = kstrndup(argstr[0].from, From 6a2593823ababdada2636398ac190931517b51cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 7 Jul 2010 09:06:08 -0700 Subject: [PATCH 20/39] ceph: specify supported features in super.h Specify the supported/required feature bits in super.h client code instead of using the definitions from the shared kernel/userspace headers (which will go away shortly). Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 6 +++--- fs/ceph/super.h | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 71263a2a78d7..79b0995dc825 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -647,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); + con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1123,8 +1123,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; - u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; + u64 sup_feat = CEPH_FEATURE_SUPPORTED; + u64 req_feat = CEPH_FEATURE_REQUIRED; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8ceb62380d3f..af62081a9535 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -30,6 +30,12 @@ #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +/* + * Supported features + */ +#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR + /* * mount options */ From 9688f19a188e5ef6ac8a4b6a5b17273028d1090e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 30 Jul 2010 10:18:04 -0700 Subject: [PATCH 21/39] ceph: strip misleading/obsolete version, feature info Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index c2e4c61a99a3..86fadeb9fd42 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -15,20 +15,6 @@ #include "msgr.h" #include "rados.h" -/* - * Ceph release version - */ -#define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 20 -#define CEPH_VERSION_PATCH 0 - -#define _CEPH_STRINGIFY(x) #x -#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x) -#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \ - "." CEPH_STRINGIFY(z) -#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \ - CEPH_VERSION_MINOR, CEPH_VERSION_PATCH) - /* * subprotocol versions. when specific messages types or high-level * protocols change, bump the affected components. we keep rev @@ -53,18 +39,10 @@ /* * feature bits */ -#define CEPH_FEATURE_UID 1 -#define CEPH_FEATURE_NOSRCADDR 2 -#define CEPH_FEATURE_FLOCK 4 - -#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_UID (1<<0) +#define CEPH_FEATURE_NOSRCADDR (1<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) +#define CEPH_FEATURE_FLOCK (1<<3) /* From 5cd068c200795537fd78c423254389c8a07e2d20 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 7 Jul 2010 08:38:17 -0700 Subject: [PATCH 22/39] ceph: clean up header guards Signed-off-by: Sage Weil --- fs/ceph/ceph_frag.h | 4 ++-- fs/ceph/ceph_fs.h | 4 ++-- fs/ceph/ceph_hash.h | 4 ++-- fs/ceph/crush/crush.h | 4 ++-- fs/ceph/crush/hash.h | 4 ++-- fs/ceph/crush/mapper.h | 4 ++-- fs/ceph/msgr.h | 4 ++-- fs/ceph/rados.h | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h index 793f50cb7c22..5babb8e95352 100644 --- a/fs/ceph/ceph_frag.h +++ b/fs/ceph/ceph_frag.h @@ -1,5 +1,5 @@ -#ifndef _FS_CEPH_FRAG_H -#define _FS_CEPH_FRAG_H +#ifndef FS_CEPH_FRAG_H +#define FS_CEPH_FRAG_H /* * "Frags" are a way to describe a subset of a 32-bit number space, diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 86fadeb9fd42..663ff9021111 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -9,8 +9,8 @@ * LGPL2 */ -#ifndef _FS_CEPH_CEPH_FS_H -#define _FS_CEPH_CEPH_FS_H +#ifndef CEPH_FS_H +#define CEPH_FS_H #include "msgr.h" #include "rados.h" diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h index 5ac470c433c9..d099c3f90236 100644 --- a/fs/ceph/ceph_hash.h +++ b/fs/ceph/ceph_hash.h @@ -1,5 +1,5 @@ -#ifndef _FS_CEPH_HASH_H -#define _FS_CEPH_HASH_H +#ifndef FS_CEPH_HASH_H +#define FS_CEPH_HASH_H #define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ #define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h index dcd7e7523700..97e435b191f4 100644 --- a/fs/ceph/crush/crush.h +++ b/fs/ceph/crush/crush.h @@ -1,5 +1,5 @@ -#ifndef _CRUSH_CRUSH_H -#define _CRUSH_CRUSH_H +#ifndef CEPH_CRUSH_CRUSH_H +#define CEPH_CRUSH_CRUSH_H #include diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h index ff48e110e4bb..91e884230d5d 100644 --- a/fs/ceph/crush/hash.h +++ b/fs/ceph/crush/hash.h @@ -1,5 +1,5 @@ -#ifndef _CRUSH_HASH_H -#define _CRUSH_HASH_H +#ifndef CEPH_CRUSH_HASH_H +#define CEPH_CRUSH_HASH_H #define CRUSH_HASH_RJENKINS1 0 diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h index 98e90046fd9f..c46b99c18bb0 100644 --- a/fs/ceph/crush/mapper.h +++ b/fs/ceph/crush/mapper.h @@ -1,5 +1,5 @@ -#ifndef _CRUSH_MAPPER_H -#define _CRUSH_MAPPER_H +#ifndef CEPH_CRUSH_MAPPER_H +#define CEPH_CRUSH_MAPPER_H /* * CRUSH functions for find rules and then mapping an input to an diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 892a0298dfdf..680d3d648cac 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -1,5 +1,5 @@ -#ifndef __MSGR_H -#define __MSGR_H +#ifndef CEPH_MSGR_H +#define CEPH_MSGR_H /* * Data types for message passing layer used by Ceph. diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 8fcc023056c7..20b5b45fa575 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -1,5 +1,5 @@ -#ifndef __RADOS_H -#define __RADOS_H +#ifndef CEPH_RADOS_H +#define CEPH_RADOS_H /* * Data types for the Ceph distributed object storage layer RADOS From f0b18d9f22ea4e50955945661b7e165a47705249 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 7 Jul 2010 08:39:03 -0700 Subject: [PATCH 23/39] ceph: sync header defs with server code Define ROLLBACK op, IFLOCK inode lock (for advisory file locking). Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 1 + fs/ceph/ceph_strings.c | 1 + fs/ceph/rados.h | 9 +++++++++ 3 files changed, 11 insertions(+) diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 663ff9021111..bf7dea673237 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -253,6 +253,7 @@ extern const char *ceph_mds_state_name(int s); #define CEPH_LOCK_IDFT 512 /* dir frag tree */ #define CEPH_LOCK_INEST 1024 /* mds internal */ #define CEPH_LOCK_IXATTR 2048 +#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ /* client_session ops */ diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 7503aee828ce..0f943a0a3b5e 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -28,6 +28,7 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_TRUNCATE: return "truncate"; case CEPH_OSD_OP_ZERO: return "zero"; case CEPH_OSD_OP_WRITEFULL: return "writefull"; + case CEPH_OSD_OP_ROLLBACK: return "rollback"; case CEPH_OSD_OP_APPEND: return "append"; case CEPH_OSD_OP_STARTSYNC: return "startsync"; diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 20b5b45fa575..6d5247f2e81b 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -203,6 +203,7 @@ enum { CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, + CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, /** attrs **/ /* read */ @@ -272,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op) return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; } +/* + * note that the following tmap stuff is also defined in the ceph librados.h + * any modification here needs to be updated there + */ #define CEPH_OSD_TMAP_HDR 'h' #define CEPH_OSD_TMAP_SET 's' #define CEPH_OSD_TMAP_RM 'r' @@ -297,6 +302,7 @@ enum { CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ }; enum { @@ -350,6 +356,9 @@ struct ceph_osd_op { struct { __le64 cookie, count; } __attribute__ ((packed)) pgls; + struct { + __le64 snapid; + } __attribute__ ((packed)) snap; }; __le32 payload_len; } __attribute__ ((packed)); From a8b763a9b34561fea8e616c1439a71913ff2c1bd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Jul 2010 13:00:18 -0700 Subject: [PATCH 24/39] ceph: use %pU to print uuid (fsid) Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 4 ++-- fs/ceph/super.c | 12 ++++++------ fs/ceph/super.h | 7 ------- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2f5332ddbba..968dc5a86c3e 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -361,8 +361,8 @@ int ceph_debugfs_client_init(struct ceph_client *client) int ret = 0; char name[80]; - snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", - PR_FSID(&client->fsid), client->monc.auth->global_id); + snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, + client->monc.auth->global_id); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 3100c909cbb1..d80699a4dc26 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -151,7 +151,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) struct ceph_mount_args *args = client->mount_args; if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=" FSID_FORMAT, PR_FSID(&args->fsid)); + seq_printf(m, ",fsid=%pU", &args->fsid); if (args->flags & CEPH_OPT_NOSHARE) seq_puts(m, ",noshare"); if (args->flags & CEPH_OPT_DIRSTAT) @@ -408,7 +408,7 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid) if (i == 16) err = 0; - dout("parse_fsid ret %d got fsid " FSID_FORMAT, err, PR_FSID(fsid)); + dout("parse_fsid ret %d got fsid %pU", err, fsid); return err; } @@ -727,13 +727,13 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) { if (client->have_fsid) { if (ceph_fsid_compare(&client->fsid, fsid)) { - pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, - PR_FSID(&client->fsid), PR_FSID(fsid)); + pr_err("bad fsid, had %pU got %pU", + &client->fsid, fsid); return -1; } } else { - pr_info("client%lld fsid " FSID_FORMAT "\n", - client->monc.auth->global_id, PR_FSID(fsid)); + pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, + fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); ceph_debugfs_client_init(client); client->have_fsid = true; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index af62081a9535..281e458db8b3 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -746,13 +746,6 @@ extern struct kmem_cache *ceph_file_cachep; extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \ - "%02x%02x%02x%02x%02x%02x" -#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \ - (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \ - (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \ - (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15] - /* inode.c */ extern const struct inode_operations ceph_file_iops; From effcb9ed43d16db27ae5837d93879e067e902151 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 9 Jul 2010 11:00:08 -0700 Subject: [PATCH 25/39] ceph: print useful error message when crush rule not found Include the crush_ruleset in the error message. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 46b391d8e86c..1d5f58cc2d93 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -1026,8 +1026,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, pool->v.type, pool->v.size); if (ruleno < 0) { - pr_err("no crush rule pool %d type %d size %d\n", - poolid, pool->v.type, pool->v.size); + pr_err("no crush rule pool %d ruleset %d type %d size %d\n", + poolid, pool->v.crush_ruleset, pool->v.type, + pool->v.size); return NULL; } From b8cd07e78eaa49857e882f4199309f86aeb80bbd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 16 Jul 2010 12:00:02 -0700 Subject: [PATCH 26/39] ceph: warn on missing snap realm Well, this Shouldn't Happen, so it would be helpful to know the caller when it does. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e3b848d7698d..30acc7b046ee 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -586,6 +586,7 @@ retry: } else { pr_err("ceph_add_cap: couldn't find snap realm %llx\n", realmino); + WARN_ON(!realm); } } From 2d9c98ae97c18e8b1c363af6a2e51d5d9e8c5e04 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 30 Jul 2010 09:38:13 -0700 Subject: [PATCH 27/39] ceph: make ->sync_fs not wait if wait==0 The ->sync_fs() super op only needs to wait if wait is true. Otherwise, just get some dirty cap writeback started. Signed-off-by: Sage Weil --- fs/ceph/super.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d80699a4dc26..7f751f2850ba 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -102,12 +102,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) } -static int ceph_syncfs(struct super_block *sb, int wait) +static int ceph_sync_fs(struct super_block *sb, int wait) { - dout("sync_fs %d\n", wait); + struct ceph_client *client = ceph_sb_to_client(sb); + + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(&client->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + + dout("sync_fs (blocking)\n"); ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); - dout("sync_fs %d done\n", wait); + dout("sync_fs (blocking) done\n"); return 0; } @@ -278,7 +287,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, - .sync_fs = ceph_syncfs, + .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, .statfs = ceph_statfs, From 73a7e693f9da464b0df07643af3f8ffc04dcc0b5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 2 Aug 2010 11:00:55 -0700 Subject: [PATCH 28/39] ceph: fix decoding of pool snap info The pool info contains a vector for snap_info_t, not snap ids. This fixes the broken decoding, which would declare teh update corrupt when a pool snapshot was created. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 1d5f58cc2d93..e94c6fb2e2a4 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -static void __decode_pool(void **p, struct ceph_pg_pool_info *pi) +static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { + unsigned n, m; + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); calc_pg_masks(pi); - *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + + /* num_snaps * snap_info_t */ + n = le32_to_cpu(pi->v.num_snaps); + while (n--) { + ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + + sizeof(struct ceph_timespec), bad); + *p += sizeof(u64) + /* key */ + 1 + sizeof(u64) + /* u8, snapid */ + sizeof(struct ceph_timespec); + m = ceph_decode_32(p); /* snap name */ + *p += m; + } + *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; + return 0; + +bad: + return -EINVAL; } static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) @@ -571,7 +589,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) kfree(pi); goto bad; } - __decode_pool(p, pi); + err = __decode_pool(p, end, pi); + if (err < 0) + goto bad; __insert_pg_pool(&map->pg_pools, pi); } @@ -760,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pi->id = pool; __insert_pg_pool(&map->pg_pools, pi); } - __decode_pool(p, pi); + err = __decode_pool(p, end, pi); + if (err < 0) + goto bad; } if (version >= 5 && __decode_pool_names(p, end, map) < 0) goto bad; From cbbfe499055f49c09210e04d9f88c2f483052384 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 2 Aug 2010 15:48:23 -0700 Subject: [PATCH 29/39] ceph: move AES iv definition to shared header Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 2 ++ fs/ceph/crypto.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index bf7dea673237..5bf4ec2cf9eb 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -74,6 +74,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_CRYPTO_NONE 0x0 #define CEPH_CRYPTO_AES 0x1 +#define CEPH_AES_IV "cephsageyudagreg" + /* security/authentication protocols */ #define CEPH_AUTH_UNKNOWN 0x0 #define CEPH_AUTH_NONE 0x1 diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index c9318278f419..a3e627f63293 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -75,7 +75,7 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); } -static const u8 *aes_iv = "cephsageyudagreg"; +static const u8 *aes_iv = (u8 *)CEPH_AES_IV; static int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, From ce1fbc8dd657a4bbcf26c683c9d07c88db83fd86 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 2 Aug 2010 15:09:39 -0700 Subject: [PATCH 30/39] ceph: support v2 client_caps encoding Add support for v2 encoding of MClientCaps, which includes a flock blob. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 30acc7b046ee..51546d54b841 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2698,6 +2698,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 size, max_size; u64 tid; void *snaptrace; + size_t snaptrace_len; + void *flock; + u32 flock_len; int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); @@ -2707,7 +2710,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; - snaptrace = h + 1; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; @@ -2717,6 +2719,21 @@ void ceph_handle_caps(struct ceph_mds_session *session, size = le64_to_cpu(h->size); max_size = le64_to_cpu(h->max_size); + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + + if (le16_to_cpu(msg->hdr.version) >= 2) { + void *p, *end; + + p = snaptrace + snaptrace_len; + end = msg->front.iov_base + msg->front.iov_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + flock = p; + } else { + flock = NULL; + flock_len = 0; + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2755,7 +2772,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, - snaptrace, le32_to_cpu(h->snap_trace_len)); + snaptrace, snaptrace_len); ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, session); goto done_unlocked; From 20cb34ae9e4b008a8789a48d52f5aa279dc400b6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 12 May 2010 15:21:32 -0700 Subject: [PATCH 31/39] ceph: support v2 reconnect encoding Encode either old or v2 encoding of client_reconnect message, depending on whether the peer has the FLOCK feature bit. Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 11 +++++++++- fs/ceph/mds_client.c | 52 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 5bf4ec2cf9eb..bb17a18cc190 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -632,6 +632,16 @@ struct ceph_mds_lease { /* client reconnect */ struct ceph_mds_cap_reconnect { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ + __le32 flock_len; /* size of flock state blob, if any */ +} __attribute__ ((packed)); +/* followed by flock blob */ + +struct ceph_mds_cap_reconnect_v1 { __le64 cap_id; __le32 wanted; __le32 issued; @@ -640,7 +650,6 @@ struct ceph_mds_cap_reconnect { __le64 snaprealm; __le64 pathbase; /* base ino for our path to this ino */ } __attribute__ ((packed)); -/* followed by encoded string */ struct ceph_mds_snaprealm_reconnect { __le64 ino; /* snap realm base */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 34d215ff4c82..615f720a819d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -37,6 +37,11 @@ * are no longer valid. */ +struct ceph_reconnect_state { + struct ceph_pagelist *pagelist; + bool flock; +}; + static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); @@ -2268,9 +2273,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_cap_reconnect rec; + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + size_t reclen; struct ceph_inode_info *ci; - struct ceph_pagelist *pagelist = arg; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; int pathlen, err; u64 pathbase; @@ -2303,17 +2313,29 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&inode->i_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ - rec.cap_id = cpu_to_le64(cap->cap_id); - rec.pathbase = cpu_to_le64(pathbase); - rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); - rec.issued = cpu_to_le32(cap->issued); - rec.size = cpu_to_le64(inode->i_size); - ceph_encode_timespec(&rec.mtime, &inode->i_mtime); - ceph_encode_timespec(&rec.atime, &inode->i_atime); - rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + + if (recon_state->flock) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = 0; + reclen = sizeof(rec.v2); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + reclen = sizeof(rec.v1); + } spin_unlock(&inode->i_lock); - err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); + err = ceph_pagelist_append(pagelist, &rec, reclen); out: kfree(path); @@ -2342,6 +2364,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds = session->s_mds; int err = -ENOMEM; struct ceph_pagelist *pagelist; + struct ceph_reconnect_state recon_state; pr_info("mds%d reconnect start\n", mds); @@ -2376,7 +2399,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); if (err) goto fail; - err = iterate_session_caps(session, encode_caps_cb, pagelist); + + recon_state.pagelist = pagelist; + recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; @@ -2401,6 +2427,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, } reply->pagelist = pagelist; + if (recon_state.flock) + reply->hdr.version = cpu_to_le16(2); reply->hdr.data_len = cpu_to_le32(pagelist->length); reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); From c6f3fdc592d61847da0e2172e352dbcb53c83d39 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 2 Aug 2010 14:55:24 -0700 Subject: [PATCH 32/39] ceph: add CEPH_FEATURE_FLOCK to the supported feature bits This informs the server that we will accept v2 client_caps format and v2 client_reconnect format messages. Signed-off-by: Greg Farnum Signed-off-by: Sage Weil --- fs/ceph/super.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 281e458db8b3..83f7cc5fab10 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -33,7 +33,7 @@ /* * Supported features */ -#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK #define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR /* From fbaad9797a761c2d5ff6e755bbb4c046207a1ca2 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 2 Aug 2010 15:30:08 -0700 Subject: [PATCH 33/39] ceph: define on-wire types, constants for file locking support Define the MDS operations and data types for doing file advisory locking with the MDS. Signed-off-by: Greg Farnum Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 36 ++++++++++++++++++++++++++++++++++-- fs/ceph/ceph_strings.c | 2 ++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index bb17a18cc190..d5619ac86711 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -297,6 +297,8 @@ enum { CEPH_MDS_OP_RMXATTR = 0x01106, CEPH_MDS_OP_SETLAYOUT = 0x01107, CEPH_MDS_OP_SETATTR = 0x01108, + CEPH_MDS_OP_SETFILELOCK= 0x01109, + CEPH_MDS_OP_GETFILELOCK= 0x00110, CEPH_MDS_OP_MKNOD = 0x01201, CEPH_MDS_OP_LINK = 0x01202, @@ -367,6 +369,15 @@ union ceph_mds_request_args { struct { struct ceph_file_layout layout; } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 pid; /* process id requesting the lock */ + __le64 pid_namespace; + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? */ + } __attribute__ ((packed)) filelock_change; } __attribute__ ((packed)); #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ @@ -461,6 +472,23 @@ struct ceph_mds_reply_dirfrag { __le32 dist[]; } __attribute__ ((packed)); +#define CEPH_LOCK_FCNTL 1 +#define CEPH_LOCK_FLOCK 2 + +#define CEPH_LOCK_SHARED 1 +#define CEPH_LOCK_EXCL 2 +#define CEPH_LOCK_UNLOCK 4 + +struct ceph_filelock { + __le64 start;/* file offset to start lock at */ + __le64 length; /* num bytes to lock; 0 for all following start */ + __le64 client; /* which client holds the lock */ + __le64 pid; /* process id holding the lock on the client */ + __le64 pid_namespace; + __u8 type; /* shared lock, exclusive lock, or unlock */ +} __attribute__ ((packed)); + + /* file access modes */ #define CEPH_FILE_MODE_PIN 0 #define CEPH_FILE_MODE_RD 1 @@ -489,9 +517,10 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_SAUTH 2 #define CEPH_CAP_SLINK 4 #define CEPH_CAP_SXATTR 6 -#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ +#define CEPH_CAP_SFILE 8 +#define CEPH_CAP_SFLOCK 20 -#define CEPH_CAP_BITS 16 +#define CEPH_CAP_BITS 22 /* composed values */ #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) @@ -509,6 +538,9 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) #define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) #define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) +#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK) +#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK) + /* cap masks (for getattr) */ #define CEPH_STAT_CAP_INODE CEPH_CAP_PIN diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 0f943a0a3b5e..c6179d3a26a2 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -130,6 +130,8 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } return "???"; } From 40819f6fb227c1832935b775ac22aef10aa6f6dd Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 2 Aug 2010 15:34:23 -0700 Subject: [PATCH 34/39] ceph: add flock/fcntl lock support Implement flock inode operation to support advisory file locking. All lock/unlock operations are synchronous with the MDS. Lock state is sent when reconnecting to a recovering MDS to restore the shared lock state. Signed-off-by: Greg Farnum Signed-off-by: Sage Weil --- fs/ceph/Makefile | 2 +- fs/ceph/file.c | 2 + fs/ceph/locks.c | 256 +++++++++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 18 ++- fs/ceph/super.h | 8 ++ 5 files changed, 284 insertions(+), 2 deletions(-) create mode 100644 fs/ceph/locks.c diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 6a660e610be8..278e1172600d 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),) obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ +ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ messenger.o msgpool.o buffer.o pagelist.o \ mds_client.o mdsmap.o \ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5fe50ffa2deb..e850e63f9563 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -938,6 +938,8 @@ const struct file_operations ceph_file_fops = { .aio_write = ceph_aio_write, .mmap = ceph_mmap, .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .unlocked_ioctl = ceph_ioctl, diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c new file mode 100644 index 000000000000..ae85af06454f --- /dev/null +++ b/fs/ceph/locks.c @@ -0,0 +1,256 @@ +#include "ceph_debug.h" + +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "pagelist.h" + +/** + * Implement fcntl and flock locking functions. + */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, + u64 pid, u64 pid_ns, + int cmd, u64 start, u64 length, u8 wait) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + int err; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type`: %d", (int)lock_type, + (int)operation, pid, start, length, wait, cmd); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.pid = cpu_to_le64(pid); + /* This should be adjusted, but I'm not sure if + namespaces actually get id numbers*/ + req->r_args.filelock_change.pid_namespace = + cpu_to_le64((u64)pid_ns); + req->r_args.filelock_change.start = cpu_to_le64(start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, + (int)operation, pid, start, length, wait, cmd, err); + return err; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. + */ +int ceph_lock(struct file *file, int cmd, struct file_lock *fl) +{ + u64 length; + u8 lock_cmd; + int err; + u8 wait = 0; + u16 op = CEPH_MDS_OP_SETFILELOCK; + + fl->fl_nspid = get_pid(task_tgid(current)); + dout("ceph_lock, fl_pid:%d", fl->fl_pid); + + /* set wait bit as appropriate, then make command as Ceph expects it*/ + if (F_SETLKW == cmd) + wait = 1; + if (F_GETLK == cmd) + op = CEPH_MDS_OP_GETFILELOCK; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + (u64)fl->fl_pid, (u64)fl->fl_nspid, + lock_cmd, fl->fl_start, + length, wait); + if (!err) { + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if the kernel detects + * local deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + (u64)fl->fl_pid, (u64)fl->fl_nspid, + CEPH_LOCK_UNLOCK, fl->fl_start, + length, 0); + dout("got %d on posix_lock_file, undid lock", err); + } + } else { + dout("mds returned error code %d", err); + } + return err; +} + +int ceph_flock(struct file *file, int cmd, struct file_lock *fl) +{ + u64 length; + u8 lock_cmd; + int err; + u8 wait = 1; + + fl->fl_nspid = get_pid(task_tgid(current)); + dout("ceph_flock, fl_pid:%d", fl->fl_pid); + + /* set wait bit, then clear it out of cmd*/ + if (cmd & LOCK_NB) + wait = 0; + cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); + /* set command sequence that Ceph wants to see: + shared lock, exclusive lock, or unlock */ + if (LOCK_SH == cmd) + lock_cmd = CEPH_LOCK_SHARED; + else if (LOCK_EX == cmd) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, + file, (u64)fl->fl_pid, (u64)fl->fl_nspid, + lock_cmd, fl->fl_start, + length, wait); + if (!err) { + err = flock_lock_file_wait(file, fl); + if (err) { + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, (u64)fl->fl_pid, + (u64)fl->fl_nspid, + CEPH_LOCK_UNLOCK, fl->fl_start, + length, 0); + dout("got %d on flock_lock_file_wait, undid lock", err); + } + } else { + dout("mds error code %d", err); + } + return err; +} + +/** + * Must be called with BKL already held. Fills in the passed + * counter variables, so you can prepare pagelist metadata before calling + * ceph_encode_locks. + */ +void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) +{ + struct file_lock *lock; + + *fcntl_count = 0; + *flock_count = 0; + + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) + ++(*fcntl_count); + else if (lock->fl_flags & FL_FLOCK) + ++(*flock_count); + } + dout("counted %d flock locks and %d fcntl locks", + *flock_count, *fcntl_count); +} + +/** + * Encode the flock and fcntl locks for the given inode into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Must be called with BLK already held, and the lock numbers should have + * been gathered under the same lock holding window. + */ +int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + struct file_lock *lock; + struct ceph_filelock cephlock; + int err = 0; + + dout("encoding %d flock and %d fcntl locks", num_flock_locks, + num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); + if (err) + goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) { + err = lock_to_ceph_filelock(lock, &cephlock); + if (err) + goto fail; + err = ceph_pagelist_append(pagelist, &cephlock, + sizeof(struct ceph_filelock)); + } + if (err) + goto fail; + } + + err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); + if (err) + goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_FLOCK) { + err = lock_to_ceph_filelock(lock, &cephlock); + if (err) + goto fail; + err = ceph_pagelist_append(pagelist, &cephlock, + sizeof(struct ceph_filelock)); + } + if (err) + goto fail; + } +fail: + return err; +} + +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64(lock->fl_pid); + cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 615f720a819d..9f0833e1631a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "mds_client.h" #include "mon_client.h" @@ -2335,7 +2336,22 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&inode->i_lock); - err = ceph_pagelist_append(pagelist, &rec, reclen); + if (recon_state->flock) { + int num_fcntl_locks, num_flock_locks; + + lock_kernel(); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_encode_locks(inode, pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_kernel(); + } out: kfree(path); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 83f7cc5fab10..95adce441cc0 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -892,6 +892,14 @@ extern void ceph_debugfs_cleanup(void); extern int ceph_debugfs_client_init(struct ceph_client *client); extern void ceph_debugfs_client_cleanup(struct ceph_client *client); +/* locks.c */ +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, + int p_locks, int f_locks); +extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); + static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) { if (dentry && dentry->d_parent) From 213c99ee0cf17ff0fbffb6fb540bd29615cd19d5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 Aug 2010 10:25:11 -0700 Subject: [PATCH 35/39] ceph: whitespace cleanup Signed-off-by: Sage Weil --- fs/ceph/addr.c | 14 +++++++++----- fs/ceph/debugfs.c | 17 +++++++++-------- fs/ceph/file.c | 2 +- fs/ceph/mds_client.c | 5 +++-- fs/ceph/messenger.c | 9 +++++---- fs/ceph/osd_client.c | 6 +++--- fs/ceph/osdmap.c | 2 +- 7 files changed, 31 insertions(+), 24 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e00797eed29c..5598a0d02295 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -309,7 +309,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, zero_user_segment(page, s, PAGE_CACHE_SIZE); } - if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_NOFS)) { page_cache_release(page); dout("readpages %p add_to_page_cache failed %p\n", inode, page); @@ -797,9 +798,12 @@ get_more_pages: dout("%p will write page %p idx %lu\n", inode, page, page->index); - writeback_stat = atomic_long_inc_return(&client->writeback_count); - if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + writeback_stat = + atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH( + client->mount_args->congestion_kb)) { + set_bdi_congested(&client->backing_dev_info, + BLK_RW_ASYNC); } set_page_writeback(page); @@ -1036,7 +1040,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, - inode, page, (int)pos, (int)len); + inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); } while (r == -EAGAIN); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 968dc5a86c3e..360c4f22718d 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -291,7 +291,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } -#define DEFINE_SHOW_FUNC(name) \ +#define DEFINE_SHOW_FUNC(name) \ static int name##_open(struct inode *inode, struct file *file) \ { \ struct seq_file *sf; \ @@ -432,11 +432,12 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_caps) goto out; - client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", - 0600, - client->debugfs_dir, - client, - &congestion_kb_fops); + client->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + client->debugfs_dir, + client, + &congestion_kb_fops); if (!client->debugfs_congestion_kb) goto out; @@ -466,7 +467,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) debugfs_remove(client->debugfs_dir); } -#else // CONFIG_DEBUG_FS +#else /* CONFIG_DEBUG_FS */ int __init ceph_debugfs_init(void) { @@ -486,4 +487,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) { } -#endif // CONFIG_DEBUG_FS +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e850e63f9563..8c044a4f0457 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -665,7 +665,7 @@ more: * throw out any page cache pages in this range. this * may block. */ - truncate_inode_pages_range(inode->i_mapping, pos, + truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1)); } else { pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9f0833e1631a..a75ddbf9fe37 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1985,8 +1985,9 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) ; //do nothing; not an authority problem - else if (req->r_direct_mode != USE_AUTH_MDS) { + if (!req->r_inode) { + /* do nothing; not an authority problem */ + } else if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 79b0995dc825..2502d76fcec1 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con) static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, unsigned int sec_len, - u32 *crc) + struct kvec *section, + unsigned int sec_len, u32 *crc) { int left; int ret; @@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con) /* middle */ if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, middle_len, + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, &con->in_middle_crc); if (ret <= 0) return ret; @@ -1920,7 +1921,7 @@ out: /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. - */ + */ if (con->auth_retry && con->ops->invalidate_authorizer) { dout("calling invalidate_authorizer()\n"); con->ops->invalidate_authorizer(con); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 707d2dbd8776..bed6391e52c7 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -1473,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con) * authentication */ static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -1494,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con, &o->o_authorizer_reply_buf, &o->o_authorizer_reply_buf_len); if (ret) - return ret; + return ret; } *proto = ac->protocol; diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index e94c6fb2e2a4..e31f118f1392 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -855,7 +855,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, node)->pgid, pgid) <= 0) { struct ceph_pg_mapping *cur = rb_entry(rbp, struct ceph_pg_mapping, node); - + rbp = rb_next(rbp); dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); rb_erase(&cur->node, &map->pg_temp); From 52dfb8ac0ef41168c1a10590b7259a5ab1cd2ab7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 Aug 2010 10:25:30 -0700 Subject: [PATCH 36/39] ceph: constify dentry_operations This makes checkpatch happy. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 8 ++++---- fs/ceph/super.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0ec6e67533ef..67bbb41d5526 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -27,7 +27,7 @@ const struct inode_operations ceph_dir_iops; const struct file_operations ceph_dir_fops; -struct dentry_operations ceph_dentry_ops; +const struct dentry_operations ceph_dentry_ops; /* * Initialize ceph dentry state. @@ -1241,16 +1241,16 @@ const struct inode_operations ceph_dir_iops = { .create = ceph_create, }; -struct dentry_operations ceph_dentry_ops = { +const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_dentry_release, }; -struct dentry_operations ceph_snapdir_dentry_ops = { +const struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, .d_release = ceph_dentry_release, }; -struct dentry_operations ceph_snap_dentry_ops = { +const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_dentry_release, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 95adce441cc0..2482d696f0de 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -861,7 +861,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct inode_operations ceph_dir_iops; -extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, +extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); From e9d177443134bc4ac1c1393af69e2a8704bcac09 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 2 Aug 2010 16:23:49 -0700 Subject: [PATCH 37/39] ceph: do not ignore osd_idle_ttl mount option Actually apply the mount option to the mount_args struct. Signed-off-by: Sage Weil --- fs/ceph/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7f751f2850ba..9922628532b2 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -548,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_osdkeepalivetimeout: args->osd_keepalive_timeout = intval; break; + case Opt_osd_idle_ttl: + args->osd_idle_ttl = intval; + break; case Opt_mount_timeout: args->mount_timeout = intval; break; From 0eb6cd49f6e3ec523787d09cf08d3179be270db4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 5 Aug 2010 13:53:18 -0700 Subject: [PATCH 38/39] ceph: only queue async writeback on cap revocation if there is dirty data Normally, if the Fb cap bit is being revoked, we queue an async writeback. If there is no dirty data but we still hold the cap, this leaves the client sitting around doing nothing until the cap timeouts expire and the cap is released on its own (as it would have been without the revocation). Instead, only queue writeback if the bit is actually used (i.e., we have dirty data). If not, we can reply to the revocation immediately. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 51546d54b841..7bf182b03973 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2384,7 +2384,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); - if (revoking & CEPH_CAP_FILE_BUFFER) + if (revoking & used & CEPH_CAP_FILE_BUFFER) writeback = 1; /* initiate writeback; will delay ack */ else if (revoking == CEPH_CAP_FILE_CACHE && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && From e56fa10e92e077d456cbc33b7025032887772b33 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 17 May 2010 12:40:28 -0700 Subject: [PATCH 39/39] ceph: generalize mon requests, add pool op support Generalize the current statfs synchronous requests, and support pool_ops. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/mon_client.c | 180 +++++++++++++++++++++++++++++++++++++------ fs/ceph/mon_client.h | 5 ++ 2 files changed, 163 insertions(+), 22 deletions(-) diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 54fe01c50706..b2a5a3e4a671 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -349,7 +349,7 @@ out: } /* - * statfs + * generic requests (e.g., statfs, poolop) */ static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) @@ -442,6 +442,35 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } +static int do_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + int err; + + /* register request */ + mutex_lock(&monc->mutex); + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; + ceph_con_send(monc->con, ceph_msg_get(req->request)); + mutex_unlock(&monc->mutex); + + err = wait_for_completion_interruptible(&req->completion); + + mutex_lock(&monc->mutex); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; + mutex_unlock(&monc->mutex); + + if (!err) + err = req->result; + return err; +} + +/* + * statfs + */ static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { @@ -468,7 +497,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt generic reply, no tid\n"); + pr_err("corrupt generic reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -487,6 +516,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) kref_init(&req->kref); req->buf = buf; + req->buf_len = sizeof(*buf); init_completion(&req->completion); err = -ENOMEM; @@ -504,25 +534,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; - /* register request */ - mutex_lock(&monc->mutex); - req->tid = ++monc->last_tid; - req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); - monc->num_generic_requests++; - mutex_unlock(&monc->mutex); - - /* send request and wait */ - ceph_con_send(monc->con, ceph_msg_get(req->request)); - err = wait_for_completion_interruptible(&req->completion); - - mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; - mutex_unlock(&monc->mutex); - - if (!err) - err = req->result; + err = do_generic_request(monc, req); out: kref_put(&req->kref, release_generic_request); @@ -530,7 +542,126 @@ out: } /* - * Resend pending statfs requests. + * pool ops + */ +static int get_poolop_reply_buf(const char *src, size_t src_len, + char *dst, size_t dst_len) +{ + u32 buf_len; + + if (src_len != sizeof(u32) + dst_len) + return -EINVAL; + + buf_len = le32_to_cpu(*(u32 *)src); + if (buf_len != dst_len) + return -EINVAL; + + memcpy(dst, src + sizeof(u32), dst_len); + return 0; +} + +static void handle_poolop_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_poolop_reply *reply = msg->front.iov_base; + u64 tid = le64_to_cpu(msg->hdr.tid); + + if (msg->front.iov_len < sizeof(*reply)) + goto bad; + dout("handle_poolop_reply %p tid %llu\n", msg, tid); + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (req) { + if (req->buf_len && + get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), + msg->front.iov_len - sizeof(*reply), + req->buf, req->buf_len) < 0) { + mutex_unlock(&monc->mutex); + goto bad; + } + req->result = le32_to_cpu(reply->reply_code); + get_generic_request(req); + } + mutex_unlock(&monc->mutex); + if (req) { + complete(&req->completion); + put_generic_request(req); + } + return; + +bad: + pr_err("corrupt generic reply, tid %llu\n", tid); + ceph_msg_dump(msg); +} + +/* + * Do a synchronous pool op. + */ +int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, + u32 pool, u64 snapid, + char *buf, int len) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_poolop *h; + int err; + + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + req->buf_len = len; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + req->request->hdr.version = cpu_to_le16(2); + h = req->request->front.iov_base; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; + h->fsid = monc->monmap->fsid; + h->pool = cpu_to_le32(pool); + h->op = cpu_to_le32(op); + h->auid = 0; + h->snapid = cpu_to_le64(snapid); + h->name_len = 0; + + err = do_generic_request(monc, req); + +out: + kref_put(&req->kref, release_generic_request); + return err; +} + +int ceph_monc_create_snapid(struct ceph_mon_client *monc, + u32 pool, u64 *snapid) +{ + return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + pool, 0, (char *)snapid, sizeof(*snapid)); + +} + +int ceph_monc_delete_snapid(struct ceph_mon_client *monc, + u32 pool, u64 snapid) +{ + return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + pool, snapid, 0, 0); + +} + +/* + * Resend pending generic requests. */ static void __resend_generic_request(struct ceph_mon_client *monc) { @@ -783,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_statfs_reply(monc, msg); break; + case CEPH_MSG_POOLOP_REPLY: + handle_poolop_reply(monc, msg); + break; + case CEPH_MSG_MON_MAP: ceph_monc_handle_map(monc, msg); break; @@ -820,6 +955,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_SUBSCRIBE_ACK: m = ceph_msg_get(monc->m_subscribe_ack); break; + case CEPH_MSG_POOLOP_REPLY: case CEPH_MSG_STATFS_REPLY: return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index 174d794321d0..8e396f2c0963 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -50,6 +50,7 @@ struct ceph_mon_generic_request { struct rb_node node; int result; void *buf; + int buf_len; struct completion completion; struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ @@ -111,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); +extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, + u32 pool, u64 *snapid); +extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, + u32 pool, u64 snapid); #endif